p_css 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,441 @@
1
+ module CSS
2
+ # Tokenizer based on CSS Syntax Module Level 3/4 §4.
3
+ # https://www.w3.org/TR/css-syntax-3/#tokenization
4
+ class Tokenizer
5
+ include CodePoints
6
+
7
# Characters that always form a token on their own, mapped to the type of
# the token they produce.
PUNCTUATION = {
  '(' => :lparen, ')' => :rparen,
  ',' => :comma, ':' => :colon, ';' => :semicolon,
  '[' => :lbracket, ']' => :rbracket,
  '{' => :lbrace, '}' => :rbrace
}.freeze

# Matches everything input preprocessing rewrites in a single pass:
# CR LF, lone CR and FF (all become LF) and NUL (becomes U+FFFD).
PREPROCESS_RE = /\r\n?|\f|\0/.freeze
22
+
23
# Sets up a tokenizer over +input+.
#
# The input is preprocessed once, the offsets of its newlines are recorded
# for later line/column lookups, and reading starts at offset 0.
# +preserve_comments+ is stored as-is; #next_token skips comments unless it
# is truthy.
def initialize(input, preserve_comments: false)
  @preserve_comments = preserve_comments
  @pos = 0
  @input = preprocess(input)
  @newlines = collect_newline_offsets(@input)
end
29
+
30
# Drains the tokenizer: keeps calling #next_token and collects the results
# until a token of type :eof comes back. The :eof token itself is not
# included in the returned array.
def tokenize
  collected = []

  while (tok = next_token).type != :eof
    collected << tok
  end

  collected
end
42
+
43
# Produces the next token.
#
# Comments are skipped first unless comments are being preserved. When the
# input is exhausted a bare :eof token (no position) is returned; otherwise
# one token is consumed and handed a Position covering the offsets it
# spans, and the result of Token#assign_position! is returned.
def next_token
  consume_comments unless @preserve_comments
  return Token.new(:eof) if eof?

  began_at = @pos
  token = consume_one_token
  line, column = line_column_at(began_at)
  position = Position.new(line:, column:, offset: began_at, end_offset: @pos)

  token.assign_position!(position)
end
54
+
55
+ private
56
+
57
# Reads exactly one token starting at the current position and returns it
# (position information is attached by the caller). #next_token only calls
# this while input remains, so the first consumed character is never nil.
def consume_one_token
  return consume_comment_token if peek == '/' && peek(1) == '*'

  ch = consume

  return consume_whitespace if whitespace?(ch)
  return consume_string_token(ch) if ['"', "'"].include?(ch)

  # A sign or a dot may open a number; that check wins over every other
  # interpretation of the character.
  if ['+', '-', '.'].include?(ch) && number_starts?(ch, peek, peek(1))
    reconsume
    return consume_numeric_token
  end

  return Token.new(PUNCTUATION[ch]) if PUNCTUATION.key?(ch)

  case ch
  when '#'
    if !ident_code_point?(peek) && !valid_escape?(peek, peek(1))
      Token.new(:delim, ch)
    else
      # The flag must be computed before the name is consumed.
      flag = ident_sequence_starts?(peek, peek(1), peek(2)) ? :id : :unrestricted
      Token.new(:hash, consume_ident_sequence, flag:)
    end
  when '+', '.'
    Token.new(:delim, ch)
  when '-'
    if peek == '-' && peek(1) == '>'
      2.times { consume }
      Token.new(:cdc)
    elsif ident_sequence_starts?(ch, peek, peek(1))
      reconsume
      consume_ident_like_token
    else
      Token.new(:delim, ch)
    end
  when '<'
    if [peek, peek(1), peek(2)] == ['!', '-', '-']
      3.times { consume }
      Token.new(:cdo)
    else
      Token.new(:delim, ch)
    end
  when '@'
    if ident_sequence_starts?(peek, peek(1), peek(2))
      Token.new(:at_keyword, consume_ident_sequence)
    else
      Token.new(:delim, ch)
    end
  when '\\'
    if valid_escape?(ch, peek)
      reconsume
      consume_ident_like_token
    else
      Token.new(:delim, ch)
    end
  when '0'..'9'
    reconsume
    consume_numeric_token
  else
    if ident_start_code_point?(ch)
      reconsume
      consume_ident_like_token
    else
      Token.new(:delim, ch)
    end
  end
end
129
+
130
# Returns a UTF-8 copy of +input+ in which every match of PREPROCESS_RE has
# been rewritten: a NUL becomes CodePoints::REPLACEMENT, anything else the
# pattern matches becomes a single "\n".
def preprocess(input)
  utf8 = input.encode('UTF-8')

  utf8.gsub(PREPROCESS_RE) do |matched|
    matched == "\0" ? CodePoints::REPLACEMENT : "\n"
  end
end
135
+
136
# Returns the character +offset+ positions past the current read position
# without consuming anything, or nil when that index is past the end of
# the input.
def peek(offset = 0)
  @input.slice(@pos + offset)
end
139
+
140
# Returns the character at the read position and steps past it. At end of
# input it returns nil and leaves the position untouched.
def consume
  ch = @input[@pos]
  @pos += 1 if ch
  ch
end
147
+
148
# Steps the read position back by one so the most recently consumed
# character is read again.
def reconsume
  @pos = @pos - 1
end
151
+
152
# Returns the offsets of every "\n" in +input+, in ascending order.
def collect_newline_offsets(input)
  found = []
  cursor = input.index("\n")

  while cursor
    found << cursor
    cursor = input.index("\n", cursor + 1)
  end

  found
end
159
+
160
# Translates an input offset into a 1-based [line, column] pair using the
# recorded newline offsets. A newline character is reported on the line it
# ends, one column past that line's last character.
def line_column_at(offset)
  # Number of newlines located strictly before +offset+.
  newlines_before = @newlines.bsearch_index { |nl| nl >= offset } || @newlines.size
  line_start = newlines_before.zero? ? 0 : @newlines[newlines_before - 1] + 1

  [newlines_before + 1, offset - line_start + 1]
end
168
+
169
# True for a space, a line feed or a tab; false for anything else,
# including nil.
def whitespace?(c)
  [' ', "\n", "\t"].include?(c)
end
172
+
173
# True when +c+ is one of the code points U+0000..U+0008, U+000B,
# U+000E..U+001F or U+007F. nil is never non-printable.
def non_printable?(c)
  return false if c.nil?

  case c.ord
  when 0x00..0x08, 0x0B, 0x0E..0x1F, 0x7F then true
  else false
  end
end
179
+
180
# §4.3.8. Checks whether two code points form a valid escape.
#
# +c1+ must be a backslash and +c2+ must not be a newline. A backslash at
# the very end of the input (+c2+ is nil) still counts as a valid escape,
# as the specification requires: consuming such an escape yields U+FFFD,
# which #consume_escaped_code_point already returns when it hits EOF.
#
# Returns true or false.
def valid_escape?(c1, c2)
  c1 == '\\' && c2 != "\n"
end
184
+
185
# §4.3.9. Decides whether the three given code points would begin an
# ident sequence:
# * after a leading '-', the second must be an ident-start code point,
#   another '-', or the start of a valid escape;
# * a leading backslash must start a valid escape;
# * anything else must itself be an ident-start code point.
def ident_sequence_starts?(c1, c2, c3)
  return ident_start_code_point?(c2) || c2 == '-' || valid_escape?(c2, c3) if c1 == '-'
  return valid_escape?(c1, c2) if c1 == '\\'

  ident_start_code_point?(c1)
end
196
+
197
# §4.3.10. Decides whether the three given code points would begin a
# number: an optional sign followed by a digit or by ".<digit>", a dot
# followed by a digit, or simply a digit.
def number_starts?(c1, c2, c3)
  if c1 == '+' || c1 == '-'
    digit?(c2) || (c2 == '.' && digit?(c3))
  elsif c1 == '.'
    digit?(c2)
  else
    digit?(c1)
  end
end
208
+
209
# §4.3.2. Skips any run of back-to-back `/* ... */` comments at the
# current position without producing tokens. An unterminated comment
# swallows the rest of the input.
def consume_comments
  while peek == '/' && peek(1) == '*'
    2.times { consume } # the opening "/*"

    loop do
      break if eof?
      next unless consume == '*' && peek == '/'

      consume # the '/' that closes the comment
      break
    end
  end
end
223
+
224
# Used when `preserve_comments` is on: consumes one comment and returns a
# :comment token whose value is the text between `/*` and `*/`. If the
# comment is never closed, the value is everything up to end of input.
def consume_comment_token
  2.times { consume } # the opening "/*"
  body = ''.dup

  loop do
    break if eof?

    ch = consume
    if ch == '*' && peek == '/'
      consume
      break
    end

    body << ch
  end

  Token.new(:comment, body)
end
243
+
244
# True once the read position has reached (or passed) the end of the input.
def eof?
  @input.length <= @pos
end
247
+
248
# Swallows every whitespace character at the current position and returns
# a single :whitespace token for the whole run.
def consume_whitespace
  loop do
    break unless whitespace?(peek)

    consume
  end

  Token.new(:whitespace)
end
253
+
254
# §4.3.5. Consumes a string whose opening quote (+ending+) has already
# been read.
#
# Returns a :string token when the matching quote or end of input is
# reached, or a :bad_string token when an unescaped newline is found (the
# newline is left in the input).
def consume_string_token(ending)
  value = ''.dup

  loop do
    ch = consume

    if ch.nil? || ch == ending
      return Token.new(:string, value)
    elsif ch == "\n"
      reconsume
      return Token.new(:bad_string)
    elsif ch != '\\'
      value << ch
    elsif peek == "\n"
      consume # backslash-newline is a line continuation and adds nothing
    elsif !peek.nil?
      value << consume_escaped_code_point
    end
    # A backslash directly before end of input contributes nothing; the
    # next iteration sees EOF and returns the string.
  end
end
282
+
283
# §4.3.7. Consumes the remainder of an escape; the backslash itself must
# already have been consumed.
#
# Returns CodePoints::REPLACEMENT at end of input, the next character
# unchanged when it is not a hex digit, and otherwise the character named
# by up to six hex digits (one whitespace character after the digits is
# swallowed). Zero, surrogates and values above U+10FFFF also map to
# CodePoints::REPLACEMENT.
def consume_escaped_code_point
  first = consume

  return CodePoints::REPLACEMENT if first.nil?
  return first unless hex_digit?(first)

  digits = first.dup
  digits << consume while digits.length < 6 && hex_digit?(peek)
  consume if whitespace?(peek)

  code_point = digits.to_i(16)
  representable = code_point.positive? &&
                  code_point <= 0x10FFFF &&
                  !(0xD800..0xDFFF).cover?(code_point)

  representable ? [code_point].pack('U') : CodePoints::REPLACEMENT
end
302
+
303
# §4.3.11. Consumes the longest run of ident code points and valid
# escapes at the current position and returns it as a String (escapes are
# decoded). The first character that does not belong is left in the input.
def consume_ident_sequence
  name = ''.dup

  loop do
    ch = consume

    if ident_code_point?(ch)
      name << ch
      next
    end

    unless valid_escape?(ch, peek)
      # Nothing was consumed at end of input, so there is nothing to undo.
      reconsume unless ch.nil?
      break
    end

    name << consume_escaped_code_point
  end

  name
end
320
+
321
# §4.3.4. Consumes an ident sequence and classifies what follows:
# * no '(' after the name          -> :ident
# * '(' after a name other than url -> :function
# * `url(` followed (after optional whitespace) by a quote -> :function,
#   leaving at most one whitespace character before the quote in the input
# * any other `url(`               -> whatever #consume_url_token returns
def consume_ident_like_token
  name = consume_ident_sequence

  return Token.new(:ident, name) unless peek == '('

  consume # the '('
  return Token.new(:function, name) unless name.casecmp('url').zero?

  # Drop leading whitespace while at least two whitespace characters remain.
  consume while whitespace?(peek) && whitespace?(peek(1))

  lookahead = peek(whitespace?(peek) ? 1 : 0)

  if lookahead == '"' || lookahead == "'"
    Token.new(:function, name)
  else
    consume_url_token
  end
end
345
+
346
# §4.3.6. Consumes an unquoted URL; "url(" must already have been read.
#
# Returns a :url token holding the decoded text when the closing ')' or
# end of input is reached. A quote, '(' , non-printable character, invalid
# escape, or whitespace followed by anything other than ')' / end of input
# makes the URL bad: the rest of it is discarded and a :bad_url token is
# returned.
def consume_url_token
  url = ''.dup

  consume while whitespace?(peek)

  loop do
    ch = consume

    return Token.new(:url, url) if ch.nil? || ch == ')'

    if [' ', "\t", "\n"].include?(ch)
      consume while whitespace?(peek)

      closer = peek
      break unless closer.nil? || closer == ')'

      consume unless closer.nil?
      return Token.new(:url, url)
    end

    break if ['"', "'", '('].include?(ch)

    if ch == '\\'
      break unless valid_escape?(ch, peek)

      url << consume_escaped_code_point
    else
      break if non_printable?(ch)

      url << ch
    end
  end

  # Every `break` above means the URL is malformed.
  consume_bad_url_remnants
  Token.new(:bad_url)
end
390
+
391
# §4.3.14. Discards input up to and including the next ')' (or up to end
# of input). Valid escapes are consumed as a unit so an escaped ')' does
# not end the scan. Returns nil.
def consume_bad_url_remnants
  while (ch = consume) && ch != ')'
    consume_escaped_code_point if valid_escape?(ch, peek)
  end

  nil
end
401
+
402
# §4.3.3. Consumes a number and returns a :dimension token when a unit
# (ident sequence) follows, a :percentage token when '%' follows, and a
# plain :number token otherwise. The integer/number flag is passed to
# :dimension and :number tokens only.
def consume_numeric_token
  value, flag = consume_number

  if ident_sequence_starts?(peek, peek(1), peek(2))
    unit = consume_ident_sequence
    return Token.new(:dimension, value, flag:, unit:)
  end

  return Token.new(:number, value, flag:) unless peek == '%'

  consume
  Token.new(:percentage, value)
end
415
+
416
# §4.3.12. Consumes the textual form of a number and converts it.
#
# Returns [value, flag]: an Integer with :integer when the text has
# neither a fractional part nor an exponent, otherwise a Float with
# :number.
def consume_number
  text = ''.dup
  integer = true

  text << consume if peek == '+' || peek == '-'
  text << consume while digit?(peek)

  # Fractional part: only when the dot is followed by a digit.
  if peek == '.' && digit?(peek(1))
    text << consume
    text << consume while digit?(peek)
    integer = false
  end

  # Exponent: e/E, an optional sign, then at least one digit.
  if peek == 'E' || peek == 'e'
    signed = peek(1) == '+' || peek(1) == '-'

    if digit?(peek(signed ? 2 : 1))
      text << consume
      text << consume if signed
      text << consume while digit?(peek)
      integer = false
    end
  end

  integer ? [text.to_i, :integer] : [text.to_f, :number]
end
440
+ end
441
+ end
data/lib/css/urange.rb ADDED
@@ -0,0 +1,45 @@
1
+ module CSS
2
+ # Parser for CSS <urange> tokens, e.g. `U+0-7F`, `U+26`, `U+10??`.
3
+ # https://drafts.csswg.org/css-syntax/#urange-syntax
4
+ #
5
+ # Operates on the source string rather than a token stream because the
6
+ # tokenizer destructively normalizes shapes like `U+0` (the `+` is
7
+ # absorbed into a number-token whose sign is lost on serialization).
8
+ # Sticking with the source preserves the exact form.
9
module Urange
  # "u+" (any case), one to six hex digits or '?', optionally followed by
  # "-" and one to six hex digits.
  URANGE_RE = /\Au\+([0-9a-f?]{1,6})(?:-([0-9a-f]{1,6}))?\z/i.freeze
  # Hex digits followed by one or more '?', i.e. wildcards only at the end.
  WILDCARD_RE = /\A[0-9a-f]*\?+\z/i.freeze

  MAX_CODEPOINT = 0x10FFFF

  extend self

  # Parses a urange such as `U+26`, `U+0-7F` or `U+10??` from its source
  # text (surrounding whitespace is ignored) and returns a
  # Nodes::UnicodeRange built from the first and last code point.
  #
  # Raises ParseError when the text does not have urange shape, when
  # wildcards appear in range form or are not trailing, when the end
  # exceeds MAX_CODEPOINT, or when the start is greater than the end.
  def parse(input)
    match = URANGE_RE.match(input.to_s.strip)

    raise ParseError, "invalid urange: #{input.inspect}" unless match

    head, tail = match.captures
    wildcard = head.include?('?')

    raise ParseError, 'wildcards are not allowed in range form' if tail && wildcard
    raise ParseError, 'wildcards must be trailing' if wildcard && !head.match?(WILDCARD_RE)

    # Without wildcards `tr` leaves the digits untouched, so a single
    # code point yields first == last.
    first = head.tr('?', '0').to_i(16)
    last = (tail || head.tr('?', 'f')).to_i(16)

    raise ParseError, "codepoint out of range: U+#{format('%X', last)}" if last > MAX_CODEPOINT
    raise ParseError, "urange start must be <= end (U+#{format('%X', first)} > U+#{format('%X', last)})" if first > last

    Nodes::UnicodeRange.new(first:, last:)
  end
end
45
+ end
@@ -0,0 +1,3 @@
1
module CSS
  # Version string of the gem. Frozen so the shared constant cannot be
  # mutated in place by callers.
  VERSION = '0.1.0'.freeze
end
data/lib/css.rb ADDED
@@ -0,0 +1,73 @@
1
module CSS
  # Lookup tables for the three "simple block" bracket pairs.

  # Opening token type => the opening character.
  BRACKET_OPEN_CHAR = {
    lbrace: '{',
    lbracket: '[',
    lparen: '('
  }.freeze

  # Opening token type => the token type that closes it.
  BRACKET_CLOSE_TYPE = {
    lbrace: :rbrace,
    lbracket: :rbracket,
    lparen: :rparen
  }.freeze

  # Opening character => the closing character.
  BRACKET_PAIRS = {
    '{' => '}',
    '[' => ']',
    '(' => ')'
  }.freeze
end
9
+
10
+ require_relative 'css/version'
11
+ require_relative 'css/code_points'
12
+ require_relative 'css/escape'
13
+ require_relative 'css/token'
14
+ require_relative 'css/tokenizer'
15
+ require_relative 'css/token_cursor'
16
+ require_relative 'css/nodes'
17
+ require_relative 'css/parser'
18
+ require_relative 'css/selectors'
19
+ require_relative 'css/media_queries'
20
+ require_relative 'css/serializer'
21
+ require_relative 'css/urange'
22
+ require_relative 'css/nesting'
23
+ require_relative 'css/cascade'
24
+
25
+ module CSS
26
# Error raised for input that cannot be parsed.
#
# When a +position+ is supplied it is kept on the exception and its string
# form is prepended to the message as "<position>: <message>".
class ParseError < StandardError
  attr_reader :position

  def initialize(message, position: nil)
    @position = position
    text = position ? "#{position}: #{message}" : message

    super(text)
  end
end
34
+
35
# Module-level convenience API. Each method forwards to the component that
# implements it.
class << self
  # Tokenizes +input+; options are passed to Tokenizer.new.
  def tokenize(input, **opts)
    Tokenizer.new(input, **opts).tokenize
  end

  # The following entry points forward their arguments unchanged to the
  # Parser method of the same name.
  def parse_stylesheet(input, **opts)
    Parser.parse_stylesheet(input, **opts)
  end

  def parse_rule(input, **opts)
    Parser.parse_rule(input, **opts)
  end

  def parse_declaration(input, **opts)
    Parser.parse_declaration(input, **opts)
  end

  def parse_block_contents(input, **opts)
    Parser.parse_block_contents(input, **opts)
  end

  def parse_component_value(input, **opts)
    Parser.parse_component_value(input, **opts)
  end

  def parse_component_values(input, **opts)
    Parser.parse_component_values(input, **opts)
  end

  def parse_comma_separated_values(input, **opts)
    Parser.parse_comma_separated_values(input, **opts)
  end

  def parse_urange(input)
    Urange.parse(input)
  end

  def parse_selector_list(input)
    Selectors::Parser.parse_selector_list(input)
  end

  def parse_selector(input)
    Selectors::Parser.parse_selector(input)
  end

  def parse_anb(input)
    Selectors::AnBParser.parse(input)
  end

  def specificity(selector)
    Selectors::SpecificityCalculator.calculate(selector)
  end

  def matches?(element, selector)
    Selectors::Matcher.matches?(element, selector)
  end

  def parse_media_query_list(input)
    MediaQueries::Parser.parse(input)
  end

  # Evaluates a media query list against a context.
  #
  # +query_list+ may be a String (parsed first) or an already parsed list.
  # +context+ may be a MediaQueries::Context, or anything responding to
  # #to_h whose entries are passed as keywords to
  # MediaQueries::Context.default.
  def media_matches?(query_list, context)
    parsed =
      if query_list.is_a?(String)
        MediaQueries::Parser.parse(query_list)
      else
        query_list
      end

    resolved =
      if context.is_a?(MediaQueries::Context)
        context
      else
        MediaQueries::Context.default(**context.to_h)
      end

    MediaQueries::Evaluator.evaluate(parsed, resolved)
  end

  # Builds a Cascade for +stylesheet+; +context+ defaults to
  # MediaQueries::Context.default.
  def cascade(stylesheet, context: MediaQueries::Context.default) = Cascade.new(stylesheet, context:)

  def desugar(stylesheet)
    Nesting.desugar(stylesheet)
  end

  def serialize(node)
    Serializer.serialize(node)
  end

  # `parse` is shorthand for `parse_stylesheet`.
  alias_method :parse, :parse_stylesheet
end
73
+ end
data/lib/p_css.rb ADDED
@@ -0,0 +1 @@
1
+ require_relative 'css'
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: p_css
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Keita Urashima
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies: []
12
+ description: p_css is a Ruby implementation of the CSS Syntax Level 4 tokenizer and
13
+ parser, including support for CSS nesting.
14
+ email:
15
+ - ursm@ursm.jp
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - LICENSE.txt
21
+ - README.md
22
+ - lib/css.rb
23
+ - lib/css/cascade.rb
24
+ - lib/css/code_points.rb
25
+ - lib/css/escape.rb
26
+ - lib/css/media_queries.rb
27
+ - lib/css/media_queries/context.rb
28
+ - lib/css/media_queries/evaluator.rb
29
+ - lib/css/media_queries/nodes.rb
30
+ - lib/css/media_queries/parser.rb
31
+ - lib/css/nesting.rb
32
+ - lib/css/nodes.rb
33
+ - lib/css/parser.rb
34
+ - lib/css/selectors.rb
35
+ - lib/css/selectors/anb_parser.rb
36
+ - lib/css/selectors/matcher.rb
37
+ - lib/css/selectors/nodes.rb
38
+ - lib/css/selectors/parser.rb
39
+ - lib/css/selectors/serializer.rb
40
+ - lib/css/selectors/specificity.rb
41
+ - lib/css/serializer.rb
42
+ - lib/css/token.rb
43
+ - lib/css/token_cursor.rb
44
+ - lib/css/tokenizer.rb
45
+ - lib/css/urange.rb
46
+ - lib/css/version.rb
47
+ - lib/p_css.rb
48
+ homepage: https://github.com/ursm/p_css
49
+ licenses:
50
+ - MIT
51
+ metadata:
52
+ bug_tracker_uri: https://github.com/ursm/p_css/issues
53
+ changelog_uri: https://github.com/ursm/p_css/releases
54
+ source_code_uri: https://github.com/ursm/p_css
55
+ rubygems_mfa_required: 'true'
56
+ rdoc_options: []
57
+ require_paths:
58
+ - lib
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '3.4'
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubygems_version: 3.6.9
71
+ specification_version: 4
72
+ summary: A CSS Syntax Level 4 parser for Ruby, with nesting support.
73
+ test_files: []