gammo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.travis.yml +6 -0
  4. data/Gemfile +9 -0
  5. data/Gemfile.lock +27 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +177 -0
  8. data/Rakefile +25 -0
  9. data/gammo.gemspec +23 -0
  10. data/lib/gammo.rb +15 -0
  11. data/lib/gammo/attribute.rb +17 -0
  12. data/lib/gammo/fragment_parser.rb +65 -0
  13. data/lib/gammo/node.rb +157 -0
  14. data/lib/gammo/parser.rb +524 -0
  15. data/lib/gammo/parser/constants.rb +94 -0
  16. data/lib/gammo/parser/foreign.rb +307 -0
  17. data/lib/gammo/parser/insertion_mode.rb +74 -0
  18. data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
  19. data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
  20. data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
  21. data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
  22. data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
  23. data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
  24. data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
  25. data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
  26. data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
  27. data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
  28. data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
  29. data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
  30. data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
  31. data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
  32. data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
  33. data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
  34. data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
  35. data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
  36. data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
  37. data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
  38. data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
  39. data/lib/gammo/parser/insertion_mode/text.rb +32 -0
  40. data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
  41. data/lib/gammo/parser/node_stack.rb +24 -0
  42. data/lib/gammo/tags.rb +9 -0
  43. data/lib/gammo/tags/table.rb +744 -0
  44. data/lib/gammo/tokenizer.rb +373 -0
  45. data/lib/gammo/tokenizer/debug.rb +34 -0
  46. data/lib/gammo/tokenizer/entity.rb +2240 -0
  47. data/lib/gammo/tokenizer/escape.rb +174 -0
  48. data/lib/gammo/tokenizer/script_scanner.rb +229 -0
  49. data/lib/gammo/tokenizer/tokens.rb +66 -0
  50. data/lib/gammo/version.rb +3 -0
  51. data/misc/html.yaml +384 -0
  52. data/misc/table.erubi +14 -0
  53. metadata +97 -0
@@ -0,0 +1,524 @@
1
+ require 'delegate'
2
+ require 'gammo/node'
3
+ require 'gammo/tags'
4
+ require 'gammo/tokenizer'
5
+ require 'gammo/parser/node_stack'
6
+ require 'gammo/parser/foreign'
7
+ require 'gammo/parser/constants'
8
+ require 'gammo/parser/insertion_mode_stack'
9
+
10
+ module Gammo
11
+ # Class for parsing an HTML input and building an HTML tree.
12
+ class Parser
13
+ require 'gammo/parser/insertion_mode'
14
+
15
+ include Foreign
16
+ include Constants
17
+
18
# Raised if anything goes wrong while parsing an HTML.
ParseError = Class.new(ArgumentError)

# Default scope stop tags defined in 12.2.4.2, keyed by element namespace
# (nil for HTML elements, 'math' and 'svg' for foreign elements).
# Both the table and every tag list in it are frozen so that the shared
# constant cannot be modified by accident.
# https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
# @!visibility private
DEFAULT_SCOPE_STOP_TAGS = {
  nil => [
    Tags::Applet, Tags::Caption, Tags::Html, Tags::Table, Tags::Td,
    Tags::Th, Tags::Marquee, Tags::Object, Tags::Template
  ].freeze,
  'math' => [
    Tags::AnnotationXml, Tags::Mi, Tags::Mn, Tags::Mo, Tags::Ms,
    Tags::Mtext
  ].freeze,
  'svg' => [Tags::Desc, Tags::ForeignObject, Tags::Title].freeze
}.freeze

# Scope constants defined in 12.2.4.2. They select which kind of scope the
# stack-of-open-elements helpers operate on.
# https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements

# @!visibility private
DEFAULT_SCOPE = 0

# @!visibility private
LIST_ITEM_SCOPE = 1

# @!visibility private
BUTTON_SCOPE = 2

# @!visibility private
TABLE_SCOPE = 3

# @!visibility private
TABLE_ROW_SCOPE = 4

# @!visibility private
TABLE_BODY_SCOPE = 5

# @!visibility private
SELECT_SCOPE = 6
55
+
56
# Tokenizer that produces the tokens consumed by the parser.
# @!visibility private
attr_accessor :tokenizer

# The token that is currently being processed.
# @!visibility private
attr_accessor :token

# The insertion mode is a state variable that controls the primary operation
# of the tree construction stage.
# https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode
# @!visibility private
attr_accessor :insertion_mode

# The original insertion mode, defined in 12.2.4.1. It is recorded when the
# insertion mode is switched to "text" or "in table text" and is the mode
# the tree construction stage returns to afterwards.
# https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode
# @!visibility private
attr_accessor :original_insertion_mode

# The stack of template insertion modes, defined in 12.4.2.1.
# @!visibility private
attr_accessor :template_stack

# The stack of open elements, defined in 12.2.4.2.
# https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
# @!visibility private
attr_accessor :open_elements

# The list of active formatting elements, defined in 12.2.4.3.
# https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements
# @!visibility private
attr_accessor :active_formatting_elements

# The element pointers defined in 12.2.4.4.
# https://html.spec.whatwg.org/multipage/parsing.html#other-parsing-state-flags
attr_accessor :head
attr_accessor :form

# Other parsing state flags defined in 12.2.4.5, with predicate-style aliases.
# https://html.spec.whatwg.org/multipage/parsing.html#other-parsing-state-flags
attr_accessor :scripting
attr_accessor :frameset_ok
alias scripting? scripting
alias frameset_ok? frameset_ok

# Document root element.
attr_accessor :document

# Self-closing flag defined in 12.2.5.
# Self-closing tags like <img /> are treated as start tag tokens, except that
# `has_self_closing_token` is set while they are being processed.
# @!visibility private
attr_accessor :has_self_closing_token

# Quirks flag defined in 12.2.5: whether the parser is operating in the
# "force-quirks" mode.
# @!visibility private
attr_accessor :quirks

# Set to true if a new element should be inserted according to the foster
# parenting rule defined in 12.2.6.1.
# https://html.spec.whatwg.org/multipage/parsing.html#creating-and-inserting-nodes
# @!visibility private
attr_accessor :foster_parenting

# The context element used when parsing an HTML fragment, defined in
# 12.2.4.2.
# https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
attr_accessor :context

# The original input text.
# @!visibility private
attr_reader :input
127
+
128
+ # Constructs a parser for parsing an HTML input.
129
+ # @param [String] input
130
+ # @param [TrueClass, FalseClass] scripting
131
+ # @param [TrueClass, FalseClass] frameset_ok
132
+ # @param [InsertionMode] insertion_mode
133
+ # @param [Gammo::Node] context
134
+ # @return [Gammo::Parser]
135
# Builds a parser for the given HTML input with an empty document, empty
# element stacks and all parsing flags cleared.
# @param [String] input
# @param [TrueClass, FalseClass] scripting
# @param [TrueClass, FalseClass] frameset_ok
# @param [InsertionMode] insertion_mode
# @param [Gammo::Node] context
# @return [Gammo::Parser]
def initialize(input, scripting: true, frameset_ok: true, insertion_mode: Initial, context: nil)
  # Caller-supplied configuration.
  @input = input
  @scripting = scripting
  @frameset_ok = frameset_ok
  @context = context
  @insertion_mode = insertion_mode

  # Collaborators and tree-construction state.
  @tokenizer = Tokenizer.new(input)
  @document = Node::Document.new
  @open_elements = Parser::NodeStack.new([])
  @active_formatting_elements = Parser::NodeStack.new([])
  @template_stack = InsertionModeStack.new([])

  # Nothing has been tokenized or flagged yet.
  @foster_parenting = @has_self_closing_token = @quirks = false
  @token = @form = @head = nil
end
153
+
154
+ # Parses the current input and builds HTML tree from it.
155
+ # @raise [Gammo::ParseError] Raised if the parser gets error while parsing.
156
+ # @return [Gammo::Node::Document, nil]
157
# Pulls tokens from the tokenizer one at a time and feeds each of them to
# the tree construction stage until the end-of-stream token was processed.
# @return [Gammo::Node::Document, nil] the document, or nil when the
#   tokenizer produced an error token other than the end of stream.
def parse
  while token != Tokenizer::EOS
    # CDATA sections are allowed only in foreign content, i.e. when the
    # current node has a namespace.
    current = open_elements.last
    tokenizer.allow_cdata!(current && current.namespace)

    self.token = tokenizer.next_token
    if token.instance_of?(Tokenizer::ErrorToken)
      return if token != Tokenizer::EOS
    end

    parse_current_token
    break if token == Tokenizer::EOS
  end
  document
end
169
+
170
+ # Always returns false.
171
+ # @return [FalseClass]
172
+ # @!visibility private
173
# Tells whether this parser is parsing an HTML fragment. This implementation
# always answers false.
# @return [FalseClass]
def fragment?
  false
end
176
+
177
+ # Returns true if given node is matched with any special elements
178
+ # defined in 12.2.4.2.
179
+ # https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
180
+ #
181
+ # @param [Gammo::Node] node
182
+ # @return [TrueClass, FalseClass]
183
+ # @see Gammo::Parser::Constants::SPECIAL_ELEMENTS
184
+ # @!visibility private
185
# Checks whether the given node is one of the special elements.
# HTML elements (namespace nil or 'html') are looked up by name in
# SPECIAL_ELEMENTS; MathML and SVG elements are matched against fixed name
# lists. Nodes in any other namespace yield nil.
#
# @param [Gammo::Node] node
# @return [TrueClass, FalseClass, nil] a truthy value for special elements.
def special_element?(node)
  case node.namespace
  when nil, 'html'
    SPECIAL_ELEMENTS[node.data]
  when 'math'
    true if %w[mi mo mn ms mtext annotation-xml].include?(node.data)
  when 'svg'
    true if %w[foreignObject desc title].include?(node.data)
  end
end
201
+
202
+ # @!visibility private
203
# Processes a synthesized token as if it had come from the tokenizer, then
# puts the real token and the self-closing flag back in place.
# @param [Class] tok token class; instantiated as `tok.new(data, tag: tag)`
# @param tag tag of the implied token
# @param data data of the implied token
def parse_implied_token(tok, tag, data)
  saved_token = token
  saved_flag = has_self_closing_token

  @token = tok.new(data, tag: tag)
  @has_self_closing_token = false
  parse_current_token

  @token = saved_token
  @has_self_closing_token = saved_flag
end
210
+
211
+ # @!visibility private
212
# Looks for one of the given tags in the given scope and, when found, cuts
# the stack of open elements right below the match (the matching element
# itself is removed as well).
# @param [Integer] scope one of the *_SCOPE constants
# @param match_tags tags to look for
# @return [TrueClass, FalseClass] whether a matching element was found
def pop_until(scope, *match_tags)
  position = index_of_element_in_scope(scope, *match_tags)
  return false if position == -1

  @open_elements = open_elements.slice(0, position)
  true
end
220
+
221
+ # @!visibility private
222
# Walks the stack of open elements from the most recently opened element
# downwards and reports the index of the first element without a namespace
# whose tag is one of `match_tags`, provided no scope boundary is met first.
#
# @param [Integer] scope DEFAULT_SCOPE, LIST_ITEM_SCOPE, BUTTON_SCOPE,
#   TABLE_SCOPE or SELECT_SCOPE
# @param match_tags tags to look for
# @return [Integer] index of the matching element, or -1 if there is none
#   in the requested scope
# @raise [ParseError] if an unsupported scope is given
def index_of_element_in_scope(scope, *match_tags)
  open_elements.reverse_each_with_index do |open_element, index|
    tag = open_element.tag
    unless open_element.namespace
      return index if match_tags.include?(tag)
      case scope
      when DEFAULT_SCOPE
        # no additional boundary tags
      when LIST_ITEM_SCOPE
        return -1 if tag == Tags::Ol || tag == Tags::Ul
      when BUTTON_SCOPE
        return -1 if tag == Tags::Button
      when TABLE_SCOPE
        return -1 if tag == Tags::Html || tag == Tags::Table || tag == Tags::Template
      when SELECT_SCOPE
        # Select scope consists of every element type except optgroup and
        # option, so anything else ends the search.
        return -1 unless tag == Tags::Optgroup || tag == Tags::Option
      else
        raise ParseError, 'unreachable parsing error, please report to github'
      end
    end
    case scope
    when DEFAULT_SCOPE, LIST_ITEM_SCOPE, BUTTON_SCOPE
      # A namespace without an entry in the table has no stop tags.
      stop_tags = DEFAULT_SCOPE_STOP_TAGS[open_element.namespace] || []
      return -1 if stop_tags.include?(tag)
    end
  end
  -1
end
249
+
250
+ # @!visibility private
251
# Inserts an element for the current token, remembers the insertion mode
# that is active right now as the original insertion mode, and then
# switches the parser to the Text insertion mode.
def parse_generic_raw_text_element
  add_element
  previous_mode = insertion_mode
  @original_insertion_mode = previous_mode
  @insertion_mode = Text
end
256
+
257
+ # 12.2.4.2
258
+ # @!visibility private
259
# Returns the adjusted current node (12.2.4.2): the context element when a
# fragment is being parsed, a context is set and exactly one element is
# open; otherwise the last element on the stack of open elements.
def adjusted_current_node
  if open_elements.length == 1 && fragment? && context
    context
  else
    open_elements.last
  end
end
263
+
264
+ # @!visibility private
265
# Tells whether an element with one of the given tags is in the given scope.
# @param [Integer] scope one of the *_SCOPE constants
# @param match_tags tags to look for
# @return [TrueClass, FalseClass]
def element_in_scope?(scope, *match_tags)
  position = index_of_element_in_scope(scope, *match_tags)
  position != -1
end
268
+
269
+ # @!visibility private
270
# Pops elements off the stack of open elements until the current node is a
# boundary element of the given context; the boundary element itself stays
# on the stack. Boundaries are html and template plus: table for
# TABLE_SCOPE, tr for TABLE_ROW_SCOPE, and tbody/tfoot/thead for
# TABLE_BODY_SCOPE.
# @param [Integer] scope TABLE_SCOPE, TABLE_ROW_SCOPE or TABLE_BODY_SCOPE
# @raise [ParseError] if any other scope is given while elements are open
def clear_stack_to_context(scope)
  open_elements.reverse_each_with_index do |open_element, index|
    tag = open_element.tag
    boundary =
      case scope
      when TABLE_SCOPE
        tag == Tags::Html || tag == Tags::Table || tag == Tags::Template
      when TABLE_ROW_SCOPE
        tag == Tags::Html || tag == Tags::Tr || tag == Tags::Template
      when TABLE_BODY_SCOPE
        tag == Tags::Html || tag == Tags::Tbody || tag == Tags::Tfoot || tag == Tags::Thead || tag == Tags::Template
      else
        raise ParseError, 'unreachable parsing error, please report to github'
      end
    next unless boundary

    @open_elements = open_elements.slice(0, index + 1)
    return
  end
end
294
+
295
+ # @!visibility private
296
# Generates implied end tags: pops elements off the stack of open elements
# for as long as the current node is a dd, dt, optgroup, option, p, rb, rp,
# rt or rtc element whose name is not listed in `exceptions`.
# If every open element is popped the stack ends up empty.
# @param exceptions element names (compared with the node's `data`) that
#   must stay on the stack
def generate_implied_end_tags(*exceptions)
  # Index of the topmost element that stays open; -1 when nothing stays.
  last_kept = -1
  open_elements.reverse_each_with_index do |node, i|
    if node.instance_of?(Node::Element)
      case node.tag
      when Tags::Dd, Tags::Dt, Tags::Optgroup, Tags::Option, Tags::P, Tags::Rb, Tags::Rp, Tags::Rt, Tags::Rtc
        # Implied end tag: keep popping unless this element is excepted.
        next unless exceptions.include?(node.data)
      end
    end
    last_kept = i
    break
  end
  @open_elements = open_elements.slice(0, last_kept + 1)
end
308
+
309
+ # @!visibility private
310
# Attaches the node to the tree, either through foster parenting or as the
# last child of the current top node, and pushes it onto the stack of open
# elements when it is an element.
# @param [Gammo::Node] node
def add_child(node)
  if should_foster_parent?
    foster_parent(node)
  else
    top.append_child(node)
  end
  open_elements << node if node.instance_of?(Node::Element)
end
314
+
315
+ # @!visibility private
316
# Returns the node new children are attached to: the last open element, or
# the document when no element is open.
def top
  current = open_elements.last
  current || document
end
319
+
320
+ # @!visibility private
321
# Builds an element from the current token (tag, data and attributes) and
# adds it to the tree via add_child.
def add_element
  element = Node::Element.new(
    tag: token.tag,
    data: token.data,
    attributes: token.attributes
  )
  add_child(element)
end
324
+
325
+ # @!visibility private
326
# Tells whether the next node has to be foster parented: foster parenting
# must be enabled and the current top node must be a table, tbody, tfoot,
# thead or tr.
# @return [TrueClass, FalseClass]
def should_foster_parent?
  if foster_parenting
    case top.tag
    when Tags::Table, Tags::Tbody, Tags::Tfoot, Tags::Thead, Tags::Tr then true
    else false
    end
  else
    false
  end
end
334
+
335
+ # @!visibility private
336
# Inserts the node according to the foster parenting rule (12.2.6.1).
#
# The last table and the last template on the stack of open elements are
# located first. If there is a template and it is above the table (or there
# is no table), the node is appended to the template. Otherwise the node
# goes into the table's parent right before the table; without a table it
# is appended to the first open element. A text node is merged into a
# directly preceding text node instead of being inserted.
#
# The search results are kept in explicitly initialised locals so that a
# missing table/template is always nil, regardless of what the stack
# iterator returns when the block never breaks.
# @param [Gammo::Node] node
def foster_parent(node)
  table = nil
  table_index = 0
  open_elements.reverse_each_with_index do |open_element, index|
    next unless open_element.tag == Tags::Table
    table = open_element
    table_index = index
    break
  end

  template = nil
  template_index = 0
  open_elements.reverse_each_with_index do |open_element, index|
    next unless open_element.tag == Tags::Template
    template = open_element
    template_index = index
    break
  end

  return template.append_child(node) if template && (!table || template_index > table_index)

  parent = table ? table.parent : open_elements[0]
  # A table without a parent: fall back to the element opened before it.
  parent ||= open_elements[table_index - 1]
  prev = table ? table.previous_sibling : parent.last_child
  if prev && prev.instance_of?(Node::Text) && node.instance_of?(Node::Text)
    prev.data += node.data
    return
  end
  parent.insert_before(node, table)
end
361
+
362
+ # @!visibility private
363
# Adds character data to the tree. Empty text is ignored. When foster
# parenting applies, a new text node is foster parented. Otherwise the text
# is appended to the last child of the top node if that child is a text
# node, or added as a new text node.
# @param [String] text
def add_text(text)
  return if text.empty?

  if should_foster_parent?
    return foster_parent(Node::Text.new(data: text))
  end

  tail = top.last_child
  unless tail && tail.instance_of?(Node::Text)
    return add_child(Node::Text.new(data: text))
  end

  tail.data += text
  nil
end
374
+
375
+ # @!visibility private
376
# Inserts an element for the current token and pushes it onto the list of
# active formatting elements.
#
# Before pushing, the list is scanned from its end back to the last scope
# marker. Entries count as identical when they are elements without a
# namespace that have the same tag, the same number of attributes, and for
# each of their attributes an attribute of the token with equal key,
# namespace and value. From the third identical entry on, entries are
# removed from the list.
def add_formatting_element
  tag = token.tag
  attrs = token.attributes
  add_element

  matches = 0
  active_formatting_elements.reverse_each_with_index do |candidate, _index|
    break if candidate.instance_of?(Node::ScopeMarker)
    next unless candidate.instance_of?(Node::Element)
    next if candidate.namespace || candidate.tag != tag || candidate.attributes.length != attrs.length

    same_attributes = candidate.attributes.all? do |a|
      attrs.any? { |b| a.key == b.key && a.namespace == b.namespace && a.value == b.value }
    end
    next unless same_attributes

    matches += 1
    active_formatting_elements.delete(candidate) if matches >= 3
  end

  active_formatting_elements << open_elements.last
end
407
+
408
+ # @!visibility private
409
# Pops entries off the list of active formatting elements until the list is
# empty or the entry just removed was a scope marker.
def clear_active_formatting_elements
  loop do
    removed = active_formatting_elements.pop
    break if active_formatting_elements.length.zero?
    break if removed.instance_of?(Node::ScopeMarker)
  end
end
415
+
416
+ # @!visibility private
417
# Reconstructs the active formatting elements.
#
# Nothing happens when the list is empty or its last entry is a scope
# marker or an element that is still open. Otherwise the list is rewound to
# the entry after the closest scope marker / still-open element (or to the
# start of the list), and from there to the end every entry is cloned,
# added to the tree with add_child and replaced by its clone in the list.
def reconstruct_active_formatting_elements
  entry = active_formatting_elements.last
  return unless entry
  return if entry.instance_of?(Node::ScopeMarker) || open_elements.index(entry)

  # Rewind: walk backwards until a marker or an open element is met.
  position = active_formatting_elements.length - 1
  while !entry.is_a?(Node::ScopeMarker) && !open_elements.index(entry)
    if position.zero?
      # Ran off the start of the list; the advance phase begins at index 0.
      position = -1
      break
    end
    position -= 1
    entry = active_formatting_elements[position]
  end

  # Advance: re-create every entry after that point.
  loop do
    position += 1
    copy = active_formatting_elements[position].clone
    add_child(copy)
    active_formatting_elements[position] = copy
    break if position == active_formatting_elements.length - 1
  end
end
437
+
438
+ # @!visibility private
439
# Acknowledges the self-closing flag of the current token by clearing it.
def acknowledge_self_closing_tag
  @has_self_closing_token = false
end
442
+
443
+ # @!visibility private
444
# Stores the current insertion mode as the original insertion mode.
# @raise [RuntimeError] if an original insertion mode is already set
def set_original_insertion_mode
  if original_insertion_mode
    raise 'bad parser state: original im was set twice'
  end

  @original_insertion_mode = @insertion_mode
end
448
+
449
+ # @!visibility private
450
# Resets the insertion mode from the stack of open elements.
#
# The stack is walked from the most recently opened element down. For the
# bottommost entry the context element is used instead when one is set.
# The first element whose tag maps to an insertion mode decides the mode;
# when the bottommost entry is reached without a match, InBody is used.
def reset_insertion_mode
  open_elements.reverse_each_with_index do |open_element, index|
    bottommost = index.zero?
    node = bottommost && context ? context : open_element

    case node.tag
    when Tags::Select
      mode = InSelect
      unless bottommost
        # Walk down the stack from the select: a template settles on
        # InSelect, a table switches to InSelectInTable.
        ancestor = node
        root = open_elements[0]
        while ancestor != root
          ancestor = open_elements[open_elements.index(ancestor) - 1]
          case ancestor.tag
          when Tags::Template
            break
          when Tags::Table
            mode = InSelectInTable
            break
          end
        end
      end
      @insertion_mode = mode
    when Tags::Td, Tags::Th
      # Marked in the original as a divergence from the HTML5 spec that
      # should eventually be removed.
      @insertion_mode = InCell
    when Tags::Tr
      @insertion_mode = InRow
    when Tags::Tbody, Tags::Thead, Tags::Tfoot
      @insertion_mode = InTableBody
    when Tags::Caption
      @insertion_mode = InCaption
    when Tags::Colgroup
      @insertion_mode = InColumnGroup
    when Tags::Table
      @insertion_mode = InTable
    when Tags::Template
      # Marked in the original as a divergence from the HTML5 spec that
      # should eventually be removed: namespaced templates are skipped.
      next if node.namespace
      @insertion_mode = template_stack.last
    when Tags::Head
      # Marked in the original as a divergence from the HTML5 spec that
      # should eventually be removed.
      @insertion_mode = InHead
    when Tags::Body
      @insertion_mode = InBody
    when Tags::Frameset
      @insertion_mode = InFrameset
    when Tags::Html
      @insertion_mode = @head ? AfterHead : BeforeHead
    else
      next unless bottommost
      @insertion_mode = InBody
    end
    return
  end
end
509
+
510
+ # @!visibility private
511
# Runs the current token through the tree construction stage.
#
# A self-closing tag token is first turned into a start tag token with the
# self-closing flag raised. The token is then handed to the foreign content
# rules or to the current insertion mode, again and again, until one of
# them reports it as consumed. Finally the self-closing flag is cleared if
# it is still set.
def parse_current_token
  if token.instance_of?(Tokenizer::SelfClosingTagToken)
    self.has_self_closing_token = true
    self.token = Tokenizer::StartTagToken.new(token.data, tag: token.tag, attributes: token.attributes)
  end

  loop do
    handled = in_foreign_content? ? parse_foreign_content : insertion_mode.new(self).process
    break if handled
  end

  self.has_self_closing_token = false if has_self_closing_token
end
523
+ end
524
+ end