syntax_tree-css 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1188 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SyntaxTree
4
+ module CSS
5
+ # Parses CSS3 stylesheets according to https://www.w3.org/TR/css-syntax-3
6
+ # from the version dated 24 December 2021.
7
+ class Parser
8
+ # Represents any kind of error that occurs during parsing.
9
# Error type used for every problem recorded or raised by the parser.
class ParseError < StandardError; end
11
+
12
+ # This is used to communicate between the various tokenization algorithms.
13
+ # It transports a value along with the new index.
14
# Immutable pair handed back by the tokenization helpers: the produced value
# together with the source index at which scanning should resume.
class State
  def initialize(value, index)
    @value, @index = value, index
  end

  attr_reader :value, :index
end
22
+
23
+ # https://www.w3.org/TR/css-syntax-3/#digit
24
# The constants below are regular expression fragments (character classes
# written as strings) that get interpolated into the tokenizer's patterns.
# They are frozen explicitly because interpolated string literals are not
# frozen by the frozen_string_literal magic comment on newer Rubies.
DIGIT = "[0-9]".freeze

# https://www.w3.org/TR/css-syntax-3/#uppercase-letter
UPPERCASE_LETTER = "[A-Z]".freeze

# https://www.w3.org/TR/css-syntax-3/#lowercase-letter
LOWERCASE_LETTER = "[a-z]".freeze

# https://www.w3.org/TR/css-syntax-3/#letter
LETTER = "[#{UPPERCASE_LETTER}#{LOWERCASE_LETTER}]".freeze

# https://www.w3.org/TR/css-syntax-3/#non-ascii-code-point
NONASCII = "[\u{80}-\u{10FFFF}]".freeze

# https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
IDENT_START = "[#{LETTER}#{NONASCII}_]".freeze

# https://www.w3.org/TR/css-syntax-3/#ident-code-point
# The trailing "-" is a literal hyphen because it closes the class.
IDENT = "[#{IDENT_START}#{DIGIT}-]".freeze

# https://www.w3.org/TR/css-syntax-3/#non-printable-code-point
NON_PRINTABLE = "[\x00-\x08\x0B\x0E-\x1F\x7F]".freeze

# https://www.w3.org/TR/css-syntax-3/#whitespace
# Only these three remain after #preprocess has normalized newlines.
WHITESPACE = "[\n\t ]".freeze
49
+
50
+ attr_reader :source, :errors
51
+
52
# Stores the preprocessed form of +source+ (see #preprocess) and starts with
# an empty list of recorded errors.
def initialize(source)
  @errors = []
  @source = preprocess(source)
end
56
+
57
# Reports whether any error has been recorded so far.
def error?
  return false if errors.empty?

  errors.any?
end
60
+
61
+ #-------------------------------------------------------------------------
62
+ # 5.3. Parser Entry Points
63
+ # https://www.w3.org/TR/css-syntax-3/#parser-entry-points
64
+ #-------------------------------------------------------------------------
65
+
66
+ # 5.3.1. Parse something according to a CSS grammar
67
+ # https://www.w3.org/TR/css-syntax-3/#parse-grammar
68
# Parses the source according to +grammar+. Only :stylesheet is supported,
# which delegates to #parse_css_stylesheet; any other value raises
# ArgumentError.
def parse(grammar: :stylesheet)
  raise ArgumentError, "Unsupported grammar: #{grammar}" unless grammar == :stylesheet

  parse_css_stylesheet
end
76
+
77
+ # 5.3.3. Parse a stylesheet
78
+ # https://www.w3.org/TR/css-syntax-3/#parse-stylesheet
79
# Tokenizes the source, consumes a top-level rule list and wraps it in a
# StyleSheet node.
#
# The location spans the first through the last rule. When there are no
# rules it is the location of the EOF token, which is still pending on the
# enumerator because #consume_rule_list only peeks at it before returning.
# Peeking here avoids walking the enumerator a second time, which would
# re-run the tokenizer and record every tokenizer error twice.
def parse_stylesheet
  tokens = tokenize
  rules = consume_rule_list(tokens, top_level: true)

  location =
    if rules.any?
      rules.first.location.to(rules.last.location)
    else
      tokens.peek.location
    end

  StyleSheet.new(rules: rules, location: location)
end
92
+
93
+ # 5.3.4. Parse a list of rules
94
+ # https://www.w3.org/TR/css-syntax-3/#parse-list-of-rules
95
# Tokenizes the source and consumes it as a rule list that is not at the top
# level.
def parse_rule_list
  tokens = tokenize
  consume_rule_list(tokens, top_level: false)
end
98
+
99
+ # 5.3.5. Parse a rule
100
+ # https://www.w3.org/TR/css-syntax-3/#parse-rule
101
# Parses the source as exactly one rule. Returns the rule, or a ParseError
# instance (returned, not raised) when the input is empty, is not a rule, or
# has anything other than comments/whitespace after the rule.
def parse_rule
  # 1.
  tokens = tokenize

  skip_trivia = lambda do
    loop do
      upcoming = tokens.peek
      break unless upcoming.is_a?(CommentToken) || upcoming.is_a?(WhitespaceToken)

      tokens.next
    end
  end

  # 2.
  skip_trivia.call

  # 3.
  head = tokens.peek
  return ParseError.new("Unexpected end of input parsing rule") if head.is_a?(EOFToken)

  rule =
    if head.is_a?(AtKeywordToken)
      consume_at_rule(tokens)
    else
      qualified = consume_qualified_rule(tokens)
      return ParseError.new("Expected a rule at #{tokens.peek.location.start_char}") unless qualified

      qualified
    end

  # 4.
  skip_trivia.call

  # 5.
  return rule if tokens.peek.is_a?(EOFToken)

  ParseError.new("Expected end of input parsing rule")
end
146
+
147
+ # 5.3.6. Parse a declaration
148
+ # https://www.w3.org/TR/css-syntax-3/#parse-declaration
149
# Parses the source as a single declaration. Returns the declaration, or a
# ParseError instance (returned, not raised) when the input is empty, does
# not start with an identifier, or is not a valid declaration.
def parse_declaration
  # 1.
  tokens = tokenize

  # 2.
  loop do
    upcoming = tokens.peek
    break unless upcoming.is_a?(CommentToken) || upcoming.is_a?(WhitespaceToken)

    tokens.next
  end

  # 3.
  head = tokens.peek
  unless head.is_a?(IdentToken)
    return ParseError.new("Unexpected end of input parsing declaration") if head.is_a?(EOFToken)

    return ParseError.new("Expected an identifier at #{tokens.peek.location.start_char}")
  end

  # 4.
  consume_declaration(tokens) ||
    ParseError.new("Expected a declaration at #{tokens.peek.location.start_char}")
end
180
+
181
+ # 5.3.8. Parse a list of declarations
182
+ # https://www.w3.org/TR/css-syntax-3/#parse-list-of-declarations
183
# Tokenizes the source and consumes it as a list of declarations.
def parse_declaration_list
  tokens = tokenize
  consume_declaration_list(tokens)
end
186
+
187
+ # 5.3.9. Parse a component value
188
+ # https://www.w3.org/TR/css-syntax-3/#parse-component-value
189
# Parses the source as exactly one component value. Returns the value, or a
# ParseError instance (returned, not raised) when the input is empty or has
# anything other than comments/whitespace after the value.
def parse_component_value
  # 1.
  tokens = tokenize

  skip_trivia = lambda do
    loop do
      upcoming = tokens.peek
      break unless upcoming.is_a?(CommentToken) || upcoming.is_a?(WhitespaceToken)

      tokens.next
    end
  end

  # 2.
  skip_trivia.call

  # 3.
  return ParseError.new("Unexpected end of input parsing component value") if tokens.peek.is_a?(EOFToken)

  # 4.
  value = consume_component_value(tokens)

  # 5.
  skip_trivia.call

  # 6.
  return value if tokens.peek.is_a?(EOFToken)

  ParseError.new("Expected end of input parsing component value")
end
228
+
229
+ # 5.3.10. Parse a list of component values
230
+ # https://www.w3.org/TR/css-syntax-3/#parse-list-of-component-values
231
# Tokenizes the source and collects component values until the EOF token is
# the next token. Returns the collected values as an array.
def parse_component_values
  tokens = tokenize
  values = []

  until tokens.peek.is_a?(EOFToken)
    values.push(consume_component_value(tokens))
  end

  values
end
238
+
239
+ private
240
+
241
+ #-------------------------------------------------------------------------
242
+ # 3. Tokenizing and Parsing CSS
243
+ # https://www.w3.org/TR/css-syntax-3/#tokenizing-and-parsing
244
+ #-------------------------------------------------------------------------
245
+
246
+ # 3.3. Preprocessing the input stream
247
+ # https://www.w3.org/TR/css-syntax-3/#input-preprocessing
248
# Returns a copy of +input+ in which CRLF pairs, lone carriage returns and
# form feeds have each become a single newline, and NUL characters have
# become U+FFFD.
#
# Surrogate characters should also be replaced with U+FFFD, but that is not
# entirely possible once the string is already UTF-8 encoded, so it is
# skipped until fallback encodings are handled:
#   .gsub(/[\u{D800}-\u{DFFF}]/, "\u{FFFD}")
def preprocess(input)
  with_newlines = input.gsub(/\r\n?|\f/, "\n")
  with_newlines.gsub("\x00", "\u{FFFD}")
end
257
+
258
+ #-------------------------------------------------------------------------
259
+ # 4. Tokenization
260
+ # https://www.w3.org/TR/css-syntax-3/#tokenization
261
+ #-------------------------------------------------------------------------
262
+
263
+ # Create an enumerator of tokens from the source.
264
# Returns an Enumerator that yields one token per #consume_token call,
# followed by a final EOF token carrying the index where scanning stopped.
def tokenize
  Enumerator.new do |yielder|
    position = 0

    until position >= source.length
      step = consume_token(position)

      yielder << step.value
      position = step.index
    end

    yielder << EOFToken[position]
  end
end
278
+
279
+ # 4.3.1. Consume a token
280
+ # https://www.w3.org/TR/css-syntax-3/#consume-token
281
# Reads the single token that starts at +index+ in the source.
#
# Returns a State whose value is the token and whose index is the position
# just past it. Invalid escapes are recorded in #errors and produce a
# delimiter token for the backslash.
def consume_token(index)
  one = index...(index + 1)

  # Builders for the two most common results: a one-code-point punctuation
  # token and a one-code-point delimiter token.
  punctuation = ->(token_class) { State.new(token_class.new(location: one), index + 1) }
  delim = ->(char) { State.new(DelimToken.new(value: char, location: one), index + 1) }

  case source[index..]
  when %r{\A/\*}
    consume_comment(index)
  when /\A#{WHITESPACE}+/o
    stop = index + $&.length
    State.new(WhitespaceToken.new(value: $&, location: index...stop), stop)
  when /\A["']/
    consume_string(index, $&)
  when /\A#/
    if ident?(source[index + 1]) || valid_escape?(source[index + 1], source[index + 2])
      name = consume_ident_sequence(index + 1)
      type = start_ident_sequence?(index + 1) ? "id" : "unrestricted"

      State.new(HashToken.new(value: name.value, type: type, location: index...name.index), name.index)
    else
      delim.call("#")
    end
  when /\A\(/
    punctuation.call(OpenParenToken)
  when /\A\)/
    punctuation.call(CloseParenToken)
  when /\A\+/
    # The check must start at the "+" itself (as the "-" and "." branches
    # do). Starting one code point later accepted inputs such as "++5" and
    # produced a number token from a lone sign.
    if start_number?(index)
      consume_numeric(index)
    else
      delim.call("+")
    end
  when /\A,/
    punctuation.call(CommaToken)
  when /\A-/
    if start_number?(index)
      consume_numeric(index)
    elsif source[index + 1] == "-" && source[index + 2] == ">"
      State.new(CDCToken.new(location: index...(index + 3)), index + 3)
    elsif start_ident_sequence?(index)
      consume_ident_like(index)
    else
      delim.call("-")
    end
  when /\A\./
    if start_number?(index)
      consume_numeric(index)
    else
      delim.call(".")
    end
  when /\A:/
    punctuation.call(ColonToken)
  when /\A;/
    punctuation.call(SemicolonToken)
  when /\A</
    if source[index...(index + 4)] == "<!--"
      State.new(CDOToken.new(location: index...(index + 4)), index + 4)
    else
      delim.call("<")
    end
  when /\A@/
    if start_ident_sequence?(index + 1)
      name = consume_ident_sequence(index + 1)
      State.new(AtKeywordToken.new(value: name.value, location: index...name.index), name.index)
    else
      delim.call("@")
    end
  when /\A\[/
    punctuation.call(OpenSquareToken)
  when %r{\A\\}
    if valid_escape?(source[index], source[index + 1])
      consume_ident_like(index)
    else
      errors << ParseError.new("invalid escape at #{index}")
      delim.call("\\")
    end
  when /\A\]/
    punctuation.call(CloseSquareToken)
  when /\A\{/
    punctuation.call(OpenCurlyToken)
  when /\A\}/
    punctuation.call(CloseCurlyToken)
  when /\A#{DIGIT}/o
    consume_numeric(index)
  when /\A#{IDENT_START}/o
    consume_ident_like(index)
  when "", nil
    State.new(EOFToken[index], index)
  else
    delim.call(source[index])
  end
end
374
+
375
+ # 4.3.2. Consume comments
376
+ # https://www.w3.org/TR/css-syntax-3/#consume-comments
377
# Consumes the comment that opens at +index+. The token value and location
# cover the opening "/*" through the closing "*/". When no closing marker
# exists an error is recorded and the comment runs to the end of the source.
def consume_comment(index)
  closer = source.index("*/", index + 2)
  errors << ParseError.new("unterminated comment starting at #{index}") if closer.nil?

  stop = closer.nil? ? source.length : closer + 2
  span = index...stop
  State.new(CommentToken.new(value: source[span], location: span), stop)
end
389
+
390
+ # 4.3.3. Consume a numeric token
391
+ # https://www.w3.org/TR/css-syntax-3/#consume-numeric-token
392
# Consumes a number at +index+ and classifies it by what follows: an ident
# sequence makes it a dimension, "%" makes it a percentage, anything else
# leaves it a plain number.
#
# Returns a State holding the token and the index just past it. Every token
# location covers the full source text of the token; for dimensions that
# includes the unit (previously the unit was left out of the location).
def consume_numeric(index)
  start = index
  number = consume_number(index)

  value, type = number.value
  index = number.index

  if start_ident_sequence?(index)
    unit = consume_ident_sequence(index)
    token = DimensionToken.new(value: value, unit: unit.value, type: type, location: start...unit.index)
    State.new(token, unit.index)
  elsif source[index] == "%"
    index += 1
    State.new(PercentageToken.new(value: value, type: type, location: start...index), index)
  else
    State.new(NumberToken.new(value: value, type: type, location: start...index), index)
  end
end
409
+
410
+ # 4.3.4. Consume an ident-like token
411
+ # https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token
412
# Consumes an ident sequence at +index+ and decides which token it starts:
# a url token or a function token for "url(", a function token for any other
# name followed by "(", a unicode range when the name is "u" and
# #consume_urange succeeds, or otherwise a plain ident token.
def consume_ident_like(index)
  start = index
  name = consume_ident_sequence(index)

  index = name.index
  string = name.value
  quote = /["']/

  if (string.casecmp("url") == 0) && (source[index] == "(")
    index += 1 # (

    # While the next two input code points are whitespace, consume the next
    # input code point.
    index += 1 while whitespace?(source[index]) && whitespace?(source[index + 1])

    quoted = quote.match?(source[index]) || (whitespace?(source[index]) && quote.match?(source[index + 1]))
    return consume_url(start) unless quoted

    State.new(FunctionToken.new(value: string, location: start...index), index)
  elsif source[index] == "("
    index += 1
    State.new(FunctionToken.new(value: string, location: start...index), index)
  elsif (string.casecmp("u") == 0) && (urange = consume_urange(index - 1))
    urange
  else
    State.new(IdentToken.new(value: string, location: start...index), index)
  end
end
442
+
443
+ # 4.3.5. Consume a string token
444
+ # https://www.w3.org/TR/css-syntax-3/#consume-string-token
445
# Consumes the string that opens with +quote+ at +index+.
#
# Returns a State holding a StringToken (also when the source ends before the
# closing quote, which records an error) or a BadStringToken when an
# unescaped newline interrupts the string. In the bad-string case the
# newline itself is left unconsumed.
def consume_string(index, quote)
  start = index
  index += 1
  value = +""

  while index <= source.length
    case source[index]
    when quote
      return State.new(StringToken.new(value: value, location: start...(index + 1)), index + 1)
    when nil
      errors << ParseError.new("unterminated string at #{start}")
      return State.new(StringToken.new(value: value, location: start...index), index)
    when "\n"
      errors << ParseError.new("newline in string at #{index}")
      return State.new(BadStringToken.new(value: value, location: start...index), index)
    when "\\"
      index += 1

      if index == source.length
        # Backslash at the very end: do nothing, the next iteration reports
        # the unterminated string.
        next
      elsif source[index] == "\n"
        # A backslash-newline pair is a line continuation: the newline is
        # consumed but contributes nothing to the string's value.
        index += 1
      else
        escaped = consume_escaped_code_point(index)
        value << escaped.value
        index = escaped.index
      end
    else
      value << source[index]
      index += 1
    end
  end
end
479
+
480
+ # 4.3.6. Consume a url token
481
+ # https://www.w3.org/TR/css-syntax-3/#consume-url-token
482
# Consumes an unquoted url token. +index+ points at the "u" of "url(".
#
# Returns a State holding a URLToken, or a BadURLToken when the contents are
# malformed (whitespace in the middle, a quote, "(", a non-printable code
# point or an invalid escape). Problems are recorded in #errors.
def consume_url(index)
  # 1.
  value = +""

  # 2.
  start = index
  index += 4 # url(
  index += 1 while whitespace?(source[index])

  finish = ->(stop) { State.new(URLToken.new(value: value, location: start...stop), stop) }
  bad_url = lambda do |message, from|
    errors << ParseError.new(message)
    remnants = consume_bad_url_remnants(from)
    State.new(BadURLToken.new(value: value + remnants.value, location: start...remnants.index), remnants.index)
  end

  # 3.
  while index <= source.length
    case source[index..]
    when /\A\)/
      return finish.call(index + 1)
    when "", nil
      errors << ParseError.new("unterminated url at #{start}")
      return finish.call(index)
    when /\A#{WHITESPACE}+/o
      index += $&.length

      case source[index]
      when ")"
        return finish.call(index + 1)
      when nil
        errors << ParseError.new("unterminated url at #{start}")
        return finish.call(index)
      else
        return bad_url.call("invalid url at #{start}", index)
      end
    when /\A(?:["'(]|#{NON_PRINTABLE})/o
      # Both alternatives are grouped under the \A anchor. Without the group
      # the non-printable alternative matched anywhere in the rest of the
      # source, not just at the current code point.
      return bad_url.call("invalid character in url at #{index}", index)
    when %r{\A\\}
      if valid_escape?(source[index], source[index + 1])
        escaped = consume_escaped_code_point(index + 1)
        value << escaped.value
        index = escaped.index
      else
        return bad_url.call("invalid escape at #{index}", index)
      end
    else
      value << source[index]
      index += 1
    end
  end
end
533
+
534
+ # 4.3.7. Consume an escaped code point
535
+ # https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
536
# Decodes the escape whose body starts at +index+ (the code point right after
# the backslash).
#
# One to six hex digits, optionally followed by one whitespace code point,
# are decoded as a code point; zero, surrogates and values above 0x10FFFF
# become U+FFFD. At the end of the source U+FFFD is returned without
# consuming anything. Any other code point stands for itself.
def consume_escaped_code_point(index)
  hex = /\A(\h{1,6})#{WHITESPACE}?/o.match(source[index..])

  if hex
    code = hex[1].to_i(16)
    invalid = code == 0 || (0xD800..0xDFFF).cover?(code) || code > 0x10FFFF

    State.new(invalid ? "\u{FFFD}" : code.chr(Encoding::UTF_8), index + hex[0].length)
  elsif index == source.length
    State.new("\u{FFFD}", index)
  else
    State.new(source[index], index + 1)
  end
end
553
+
554
+ # 4.3.8. Check if two code points are a valid escape
555
+ # https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape
556
# True when +left+ is a backslash and +right+ is anything other than a
# newline (a missing +right+ therefore also counts as a valid escape).
def valid_escape?(left, right)
  return false unless left == "\\"

  right != "\n"
end
559
+
560
+ # 4.3.9. Check if three code points would start an ident sequence
561
+ # https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
562
# Looks at up to three code points starting at +index+ and reports whether
# they would start an ident sequence: "-" followed by an ident-start code
# point, another "-" or a valid escape; an ident-start code point; or a
# valid escape.
def start_ident_sequence?(index)
  first, second, third = source[index...(index + 3)].chars
  ident_start = /#{IDENT_START}/o

  if first == "-"
    ident_start.match?(second) || (second == "-") || valid_escape?(second, third)
  elsif ident_start.match?(first)
    true
  elsif first == "\\"
    valid_escape?(first, second)
  else
    false
  end
end
577
+
578
+ # 4.3.10. Check if three code points would start a number
579
+ # https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
580
# Looks at up to three code points starting at +index+ and reports whether
# they would start a number: a sign followed by a digit or by "." and a
# digit; "." followed by a digit; or a digit.
def start_number?(index)
  first, second, third = source[index...(index + 3)].chars

  if first == "+" || first == "-"
    digit?(second) || (second == "." && digit?(third))
  elsif first == "."
    digit?(second)
  else
    /#{DIGIT}/o.match?(first)
  end
end
594
+
595
+ # 4.3.11. Consume an ident sequence
596
+ # https://www.w3.org/TR/css-syntax-3/#consume-an-ident-sequence
597
# Collects ident code points and decoded escapes starting at +index+ until
# something else is found. Returns a State with the collected string and the
# index of the first code point that was not consumed.
def consume_ident_sequence(index)
  result = +""

  while index <= source.length
    current = source[index]

    if ident?(current)
      result << current
      index += 1
      next
    end

    return State.new(result, index) unless valid_escape?(current, source[index + 1])

    escaped = consume_escaped_code_point(index + 1)
    result << escaped.value
    index = escaped.index
  end
end
613
+
614
+ # 4.3.12. Consume a number
615
+ # https://www.w3.org/TR/css-syntax-3/#consume-a-number
616
# Consumes the textual form of a number at +index+: an optional sign,
# digits, an optional fraction and an optional exponent.
#
# Returns a State whose value is a two-element array of the converted number
# (see #convert_to_number) and its type ("integer" unless a fraction or an
# exponent was present, in which case "number").
def consume_number(index)
  # 1.
  repr = +""
  type = "integer"

  take_digits = lambda do
    while digit?(source[index])
      repr << source[index]
      index += 1
    end
  end

  # 2.
  if /[+-]/.match?(source[index])
    repr << source[index]
    index += 1
  end

  # 3.
  take_digits.call

  # 4.
  if source[index] == "." && digit?(source[index + 1])
    repr << source[index..(index + 1)]
    index += 2
    type = "number"

    take_digits.call
  end

  # 5.
  exponent = /\A[Ee][+-]?#{DIGIT}+/o.match(source[index..])
  if exponent
    repr << exponent[0]
    index += exponent[0].length
    type = "number"
  end

  # 6., 7.
  State.new([convert_to_number(repr), type], index)
end
655
+
656
+ # 4.3.13. Convert a string to a number
657
+ # https://www.w3.org/TR/css-syntax-3/#convert-a-string-to-a-number
658
# Converts the textual form of a number into a numeric value using exact
# integer arithmetic: sign * (integer + fraction * 10**-digits) * 10**exponent.
# A negative power of ten makes the result a Rational instead of an Integer.
#
# Raises ParseError when +value+ does not have the shape of a number.
def convert_to_number(value)
  pattern = %r{
    \A
    (?<sign>[+-]?)
    (?<integer>#{DIGIT}*)
    (?<decimal>\.?)
    (?<fractional>#{DIGIT}*)
    (?<exponent_indicator>[Ee]?)
    (?<exponent_sign>[+-]?)
    (?<exponent>#{DIGIT}*)
    \z
  }ox

  match = pattern.match(value)
  raise ParseError, "convert_to_number called with invalid value: #{value}" unless match

  sign = match[:sign] == "-" ? -1 : 1
  integer = match[:integer].to_i

  # An empty fraction yields 0 for both the value and the digit count.
  fraction = match[:fractional].to_i
  fraction_digits = match[:fractional].length

  exponent_sign = match[:exponent_sign] == "-" ? -1 : 1
  exponent = match[:exponent].to_i

  sign * (integer + fraction * 10**(-fraction_digits)) * 10**(exponent_sign * exponent)
end
690
+
691
+ # 4.3.14. Consume the remnants of a bad url
692
+ # https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url
693
# Skips over the rest of a malformed url starting at +index+, stopping after
# the closing ")" or at the end of the source.
#
# Returns a State with the raw source text that was skipped (including the
# closing parenthesis when present) and the index just past it.
def consume_bad_url_remnants(index)
  value = +""

  while index <= source.length
    case source[index]
    when nil
      return State.new(value, index)
    when ")"
      value << ")"
      return State.new(value, index + 1)
    else
      if valid_escape?(source[index], source[index + 1])
        # Consume the whole escape, starting after the backslash, so that an
        # escaped right parenthesis ("\)") does not end the bad url. The raw
        # text of the escape is what gets recorded.
        escaped = consume_escaped_code_point(index + 1)
        value << source[index...escaped.index]
        index = escaped.index
      else
        value << source[index]
        index += 1
      end
    end
  end
end
715
+
716
+ # https://www.w3.org/TR/css-syntax-3/#digit
717
# True when +value+ contains a digit; false for nil.
def digit?(value)
  pattern = /#{DIGIT}/o
  pattern.match?(value)
end
720
+
721
+ # https://www.w3.org/TR/css-syntax-3/#ident-code-point
722
# True when +value+ contains an ident code point; false for nil.
def ident?(value)
  pattern = /#{IDENT}/o
  pattern.match?(value)
end
725
+
726
+ # https://www.w3.org/TR/css-syntax-3/#whitespace
727
# True when +value+ contains a newline, tab or space; false for nil.
def whitespace?(value)
  pattern = /#{WHITESPACE}/o
  pattern.match?(value)
end
730
+
731
+ #-------------------------------------------------------------------------
732
+ # 5. Parsing
733
+ # https://www.w3.org/TR/css-syntax-3/#parsing
734
+ #-------------------------------------------------------------------------
735
+
736
+ # 5.4.1. Consume a list of rules
737
+ # https://www.w3.org/TR/css-syntax-3/#consume-list-of-rules
738
# Consumes rules from +tokens+ until the EOF token is next (it is peeked at,
# not consumed) and returns them as an array.
#
# Comments and whitespace are skipped. CDO/CDC tokens are skipped at the top
# level and otherwise start a qualified rule. Qualified rules that fail to
# parse (nil) are left out.
def consume_rule_list(tokens, top_level: true)
  rules = []

  loop do
    upcoming = tokens.peek

    case upcoming
    when CommentToken, WhitespaceToken
      tokens.next
    when EOFToken
      return rules
    when AtKeywordToken
      rules << consume_at_rule(tokens)
    else
      if top_level && (upcoming.is_a?(CDCToken) || upcoming.is_a?(CDOToken))
        tokens.next
      else
        rule = consume_qualified_rule(tokens)
        rules << rule if rule
      end
    end
  end
end
762
+
763
+ # 5.4.2. Consume an at-rule
764
+ # https://www.w3.org/TR/css-syntax-3/#consume-at-rule
765
# Consumes an at-rule whose at-keyword token is next in +tokens+.
#
# Component values are gathered into the prelude until a semicolon (consumed),
# the EOF token (recorded as an error, not consumed) or a "{" block ends the
# rule. The block is nil unless a "{" block was found.
def consume_at_rule(tokens)
  name_token = tokens.next
  prelude = []

  build = lambda do |block, ending|
    AtRule.new(name: name_token.value, prelude: prelude, block: block, location: name_token.location.to(ending))
  end

  loop do
    case tokens.peek
    in SemicolonToken[location:]
      tokens.next
      return build.call(nil, location)
    in EOFToken[location:]
      errors << ParseError.new("Unexpected EOF while parsing at-rule")
      return build.call(nil, location)
    in OpenCurlyToken
      block = consume_simple_block(tokens)
      return build.call(block, block.location)
    else
      prelude << consume_component_value(tokens)
    end
  end
end
786
+
787
+ # 5.4.3. Consume a qualified rule
788
+ # https://www.w3.org/TR/css-syntax-3/#consume-qualified-rule
789
# Consumes a qualified rule: component values up to a "{" block form the
# prelude. Returns the rule, or nil (after recording an error) when the EOF
# token is reached before any block.
def consume_qualified_rule(tokens)
  prelude = []

  loop do
    upcoming = tokens.peek

    if upcoming.is_a?(EOFToken)
      errors << ParseError.new("Unexpected EOF while parsing qualified rule")
      return nil
    end

    unless upcoming.is_a?(OpenCurlyToken)
      prelude << consume_component_value(tokens)
      next
    end

    block = consume_simple_block(tokens)
    span = prelude.any? ? prelude.first.location.to(block.location) : block.location
    return QualifiedRule.new(prelude: prelude, block: block, location: span)
  end
end
807
+
808
+ # 5.4.4. Consume a style block’s contents
809
+ # https://www.w3.org/TR/css-syntax-3/#consume-style-block
810
# Consumes the contents of a style block from +tokens+ (which must end with
# an EOF token) and returns the declarations followed by the nested rules.
#
# Differences from the previous implementation:
# * the semicolon that ends a declaration is consumed but no longer handed
#   to #consume_declaration, so it cannot end up in the declaration's value
#   or hide a trailing "!important";
# * when a declaration runs up to the EOF token the result is returned
#   right away instead of peeking past the end of the enumerator;
# * comments between declarations are skipped like whitespace;
# * error recovery checks token classes, matching #consume_declaration_list.
def consume_style_block_contents(tokens)
  declarations = []
  rules = []

  loop do
    case tokens.peek
    in SemicolonToken | WhitespaceToken | CommentToken
      tokens.next
    in EOFToken
      tokens.next
      return declarations + rules
    in AtKeywordToken
      rules << consume_at_rule(tokens)
    in IdentToken
      list = [tokens.next]
      reached_end = false

      loop do
        case tokens.peek
        in EOFToken
          list << tokens.next
          reached_end = true
          break
        in SemicolonToken
          tokens.next
          # The temporary list needs its own EOF token to stop
          # #consume_declaration.
          list << EOFToken[list.last.location.end_char]
          break
        else
          list << consume_component_value(tokens)
        end
      end

      declaration = consume_declaration(list.to_enum)
      declarations << declaration if declaration

      # The EOF token has been consumed, so there is nothing left to peek at.
      return declarations + rules if reached_end
    in DelimToken[value: "&"]
      rule = consume_qualified_rule(tokens)
      rules << rule if rule
    in { location: }
      errors << ParseError.new("Unexpected token while parsing style block at #{location.start_char}")

      # Throw away component values up to the next semicolon or the end.
      loop do
        case tokens.peek
        in EOFToken | SemicolonToken
          break
        else
          consume_component_value(tokens)
        end
      end
    end
  end
end
854
+
855
+ # 5.4.5. Consume a list of declarations
856
+ # https://www.w3.org/TR/css-syntax-3/#consume-list-of-declarations
857
# Consumes declarations (and at-rules) from +tokens+ until the EOF token has
# been consumed, and returns them in source order.
#
# Semicolons, whitespace and comments between declarations are skipped.
# Comments used to fall into the error-recovery branch, which discarded the
# declaration that followed them. Any other unexpected token records an
# error and discards component values up to the next semicolon or the end.
def consume_declaration_list(tokens)
  declarations = []
  terminator = ->(token) { token.is_a?(EOFToken) || token.is_a?(SemicolonToken) }

  loop do
    case tokens.peek
    in SemicolonToken | WhitespaceToken | CommentToken
      tokens.next
    in EOFToken
      tokens.next
      return declarations
    in AtKeywordToken
      declarations << consume_at_rule(tokens)
    in IdentToken
      list = [tokens.next]
      list << consume_component_value(tokens) until terminator.call(tokens.peek)

      reached_end = tokens.peek.is_a?(EOFToken)
      last = tokens.next

      # The temporary list always ends with an EOF token: the real one when
      # the input ended, otherwise a synthetic one placed after the last
      # component value (the semicolon itself is dropped).
      list << (reached_end ? last : EOFToken[list.last.location.end_char])

      declaration = consume_declaration(list.to_enum)
      declarations << declaration if declaration

      return declarations if reached_end
    else
      errors << ParseError.new("Unexpected token while parsing declaration list at #{tokens.peek.location.start_char}")

      consume_component_value(tokens) until terminator.call(tokens.peek)
    end
  end
end
909
+
910
+ # 5.4.6. Consume a declaration
911
+ # https://www.w3.org/TR/css-syntax-3/#consume-declaration
912
# Consumes one declaration from +tokens+, which must start with the name
# token and end with an EOF token.
#
# Returns a Declaration, or nil (after recording an error) when the name is
# not followed by a colon. A trailing "!" "important" pair is removed from
# the value and reported through the important flag; trailing comments and
# whitespace are trimmed from the value.
def consume_declaration(tokens)
  name = tokens.next
  trivia = ->(token) { token.is_a?(CommentToken) || token.is_a?(WhitespaceToken) }
  skip_trivia = -> { loop { trivia.call(tokens.peek) ? tokens.next : break } }

  # 1.
  skip_trivia.call

  # 2.
  unless tokens.peek.is_a?(ColonToken)
    errors << ParseError.new("Expected colon at #{tokens.peek.location.start_char}")
    return
  end
  tokens.next

  # 3.
  skip_trivia.call

  # 4.
  value = []
  value << consume_component_value(tokens) until tokens.peek.is_a?(EOFToken)

  # 5.
  important = false

  case value.reject(&trivia)[-2..]
  in [DelimToken[value: "!"] => bang, IdentToken[value: /\Aimportant\z/i] => keyword]
    value.delete(bang)
    value.delete(keyword)
    important = true
  else
    # not flagged as important
  end

  # 6.
  value.pop while trivia.call(value.last)

  # 7.
  location = value.any? ? name.location.to(value.last.location) : name.location
  Declaration.new(name: name.value, value: value, important: important, location: location)
end
973
+
974
+ # 5.4.7. Consume a component value
975
+ # https://www.w3.org/TR/css-syntax-3/#consume-component-value
976
# Consumes one component value: a simple block when an opening bracket is
# next, a function when a function token is next, otherwise the next token
# itself.
def consume_component_value(tokens)
  case tokens.peek
  when OpenCurlyToken, OpenSquareToken, OpenParenToken then consume_simple_block(tokens)
  when FunctionToken then consume_function(tokens)
  else tokens.next
  end
end
986
+
987
+ # 5.4.8. Consume a simple block
988
+ # https://www.w3.org/TR/css-syntax-3/#consume-simple-block
989
# Consumes a bracketed block whose opening token is next in +tokens+.
# Component values are collected until the matching closing token (consumed)
# or the EOF token (recorded as an error, not consumed) is reached.
def consume_simple_block(tokens)
  opener = tokens.next
  closers = {
    OpenParenToken => CloseParenToken,
    OpenSquareToken => CloseSquareToken,
    OpenCurlyToken => CloseCurlyToken
  }
  closer = closers[opener.class]
  contents = []

  loop do
    case tokens.peek
    when closer
      span = opener.location.to(tokens.next.location)
      return SimpleBlock.new(token: opener.value, value: contents, location: span)
    when EOFToken
      errors << ParseError.new("Unexpected EOF while parsing simple block at #{opener.location.start_char}")
      span = opener.location.to(tokens.peek.location)
      return SimpleBlock.new(token: opener.value, value: contents, location: span)
    else
      contents << consume_component_value(tokens)
    end
  end
end
1012
+
1013
+ # 5.4.9. Consume a function
1014
+ # https://www.w3.org/TR/css-syntax-3/#consume-function
1015
# Consumes a function whose function token is next in +tokens+. Component
# values are collected until the closing parenthesis (consumed) or the EOF
# token (recorded as an error, not consumed) is reached.
def consume_function(tokens)
  name_token = tokens.next
  arguments = []

  build = lambda do |ending|
    Function.new(name: name_token.value, value: arguments, location: name_token.location.to(ending))
  end

  loop do
    case tokens.peek
    in CloseParenToken[location:]
      tokens.next
      return build.call(location)
    in EOFToken[location:]
      errors << ParseError.new("Unexpected EOF while parsing function at #{name_token.location.start_char}")
      return build.call(location)
    else
      arguments << consume_component_value(tokens)
    end
  end
end
1032
+
1033
+ #-------------------------------------------------------------------------
1034
+ # 7. The Unicode-Range microsyntax
1035
+ # https://www.w3.org/TR/css-syntax-3/#urange
1036
+ #-------------------------------------------------------------------------
1037
+
1038
+ # 7.1. The <urange> type
1039
+ # https://www.w3.org/TR/css-syntax-3/#urange-syntax
1040
# Tries to read a unicode range starting at +index+, which points at the
# leading "u". Returns a State holding a URange and the index just past it,
# or nil when the text is not a valid unicode range.
def consume_urange(index)
  start = index
  index += 1 # to move past the "u"

  # The tokens that follow the "u" are fed through a small state machine to
  # see whether they have the shape of a unicode range before their text is
  # examined. Transitions (states 3 through 7 are final):
  #
  #   1 -- "+"       --> 2      4 -- "?"       --> 5
  #   1 -- dimension --> 3      4 -- dimension --> 6
  #   1 -- number    --> 4      4 -- number    --> 7
  #   2 -- "?"       --> 3      5 -- "?"       --> 5
  #   2 -- ident     --> 3
  #   3 -- "?"       --> 3
  #
  # A token without a transition ends the walk: in a final state the text
  # gathered so far is parsed, otherwise this is not a unicode range.
  box = 1

  loop do
    state = consume_token(index)
    box =
      case [box, state.value]
      in [1, DelimToken[value: "+"]] then 2
      in [1, DimensionToken] then 3
      in [1, NumberToken] then 4
      in [2, DelimToken[value: "?"]] then 3
      in [2, IdentToken] then 3
      in [3, DelimToken[value: "?"]] then 3
      in [4, DelimToken[value: "?"]] then 5
      in [4, DimensionToken] then 6
      in [4, NumberToken] then 7
      in [5, DelimToken[value: "?"]] then 5
      else
        if [3, 4, 5, 6, 7].include?(box)
          break # final states
        else
          return
        end
      end

    index = state.index
  end

  # 2.
  # The accepted tokens are contiguous, so their text is the source between
  # the "u" and the current index. Slicing the source directly keeps the
  # text complete even when a token's location does not cover all of its
  # text (as with a dimension token whose location stops before the unit).
  text = "u" + source[(start + 1)...index]
  return if text[1] != "+"
  offset = 2

  # 3.
  match = text[offset..].match(/\A\h*\?*/)
  return unless match

  value = match[0]
  return unless (1..6).cover?(value.length)

  offset += value.length
  start_value, end_value =
    if value.end_with?("?")
      return if offset != text.length
      [value.gsub("?", "0").hex, value.gsub("?", "F").hex]
    else
      [value.hex, value.hex]
    end

  # 4.
  if offset == text.length
    return unless valid_urange?(start_value, end_value)

    return State.new(URange.new(start_value: start_value, end_value: end_value, location: start...index), index)
  end

  # 5.
  return if text[offset] != "-"
  offset += 1

  # 6.
  match = text[offset..].match(/\A\h*/)
  return if !match || match[0].length > 6

  end_value = match[0].hex
  offset += match[0].length
  return if offset != text.length

  # 7.
  return unless valid_urange?(start_value, end_value)

  State.new(URange.new(start_value: start_value, end_value: end_value, location: start...index), index)
end
1144
+
1145
+ # Checks that the start and end value of a urange are valid.
1146
# Returns true when the range is usable. Otherwise records an error and
# returns false: the end must not exceed 0x10FFFF and the start must not
# exceed the end.
def valid_urange?(start_value, end_value)
  problem =
    if end_value > 0x10FFFF
      "Invalid urange. #{end_value} greater than 0x10FFFF"
    elsif start_value > end_value
      "Invalid urange. #{start_value} greater than #{end_value}"
    end

  return true unless problem

  errors << ParseError.new(problem)
  false
end
1157
+
1158
+ #-------------------------------------------------------------------------
1159
+ # 9. CSS stylesheets
1160
+ # https://www.w3.org/TR/css-syntax-3/#css-stylesheets
1161
+ #-------------------------------------------------------------------------
1162
+
1163
+ # https://www.w3.org/TR/css-syntax-3/#parse-a-css-stylesheet
1164
# Parses a stylesheet and turns every qualified rule in it into a style
# rule; other rules are kept as they are.
def parse_css_stylesheet
  stylesheet = parse_stylesheet

  converted =
    stylesheet.rules.map do |rule|
      next rule unless rule.is_a?(QualifiedRule)

      create_style_rule(rule)
    end

  CSSStyleSheet.new(rules: converted, location: stylesheet.location)
end
1173
+
1174
+ # 9.1. Style rules
1175
+ # https://www.w3.org/TR/css-syntax-3/#style-rules
1176
# Builds a StyleRule from a qualified rule: the prelude is parsed as
# selectors and the block's contents as style block contents. Both token
# lists get their own EOF token positioned at the end of the rule.
def create_style_rule(rule)
  selector_tokens = [*rule.prelude, EOFToken[rule.location.end_char]]
  block_tokens = [*rule.block.value, EOFToken[rule.location.end_char]]

  selectors = Selectors.new(selector_tokens).parse
  declarations = consume_style_block_contents(block_tokens.to_enum)

  StyleRule.new(selectors: selectors, declarations: declarations, location: rule.location)
end
1186
+ end
1187
+ end
1188
+ end