p_css 0.1.9 → 0.2.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,596 @@
1
+ // Port of lib/css/tokenizer.rb (CSS Syntax Module Level 3/4 §4).
2
+ // Position tracking is intentionally omitted in this first cut — only
3
+ // type/value/flag/unit parity with the pure-Ruby Token is targeted.
4
+
5
/// U+FFFD REPLACEMENT CHARACTER, used wherever the tokenizer has to stand in
/// for a code point it cannot represent (NUL input, invalid escapes).
const REPLACEMENT: char = char::REPLACEMENT_CHARACTER;
6
+
7
/// Category of a token emitted by the tokenizer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Kind {
    Ident,
    Function,
    AtKeyword,
    Hash,
    String_,
    BadString,
    Url,
    BadUrl,
    Delim,
    Number,
    Percentage,
    Dimension,
    Whitespace,
    /// `<!--`
    Cdo,
    /// `-->`
    Cdc,
    Comment,
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `,`
    Comma,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
}
14
+
15
/// Type flag carried by `Hash` tokens.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum HashFlag {
    /// The text after `#` would start an ident sequence.
    Id,
    /// Any other hash token.
    Unrestricted,
}
17
+
18
/// Type flag carried by numeric tokens.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum NumberFlag {
    /// The literal had neither a fractional part nor an exponent.
    Integer,
    /// The literal had a fractional part and/or an exponent.
    Number,
}
20
+
21
/// Payload attached to a token.
///
/// `PartialEq` is derived so that values can be compared directly (for
/// example in assertions); `Eq` is not available because of the `f64`
/// variant.
#[derive(Clone, Debug, PartialEq)]
pub enum TokenValue {
    /// The token carries no payload.
    None,
    /// Textual payload (names, string contents, URLs, comment bodies).
    Str(String),
    /// The single code point of a `Delim` token.
    Delim(char),
    /// Numeric payload of a literal flagged as an integer.
    Int(i64),
    /// Numeric payload of a literal with a fraction and/or exponent.
    Float(f64),
}
29
+
30
+ #[derive(Clone, Debug)]
31
+ pub struct Token {
32
+ pub kind: Kind,
33
+ pub value: TokenValue,
34
+ pub number_flag: Option<NumberFlag>,
35
+ pub hash_flag: Option<HashFlag>,
36
+ pub unit: Option<String>,
37
+ }
38
+
39
+ impl Token {
40
+ fn bare(kind: Kind) -> Self {
41
+ Self { kind, value: TokenValue::None, number_flag: None, hash_flag: None, unit: None }
42
+ }
43
+
44
+ fn delim(c: char) -> Self {
45
+ Self { value: TokenValue::Delim(c), ..Self::bare(Kind::Delim) }
46
+ }
47
+
48
+ fn with_str(kind: Kind, s: String) -> Self {
49
+ Self { value: TokenValue::Str(s), ..Self::bare(kind) }
50
+ }
51
+ }
52
+
53
/// Cursor-based tokenizer over a preprocessed sequence of code points.
pub struct Tokenizer {
    /// Input code points, already normalised by `preprocess`.
    chars: Vec<char>,
    /// Index into `chars` of the next code point to consume.
    pos: usize,
    /// When `true`, comments are emitted as `Comment` tokens instead of
    /// being skipped.
    preserve_comments: bool,
}
58
+
59
+ impl Tokenizer {
60
+ pub fn new(input: &str, preserve_comments: bool) -> Self {
61
+ Self {
62
+ chars: preprocess(input),
63
+ pos: 0,
64
+ preserve_comments,
65
+ }
66
+ }
67
+
68
+ pub fn tokenize(&mut self) -> Vec<Token> {
69
+ let mut out = Vec::new();
70
+
71
+ loop {
72
+ if !self.preserve_comments {
73
+ self.consume_comments();
74
+ }
75
+
76
+ if self.eof() {
77
+ break;
78
+ }
79
+
80
+ out.push(self.consume_one_token());
81
+ }
82
+
83
+ out
84
+ }
85
+
86
+ // --- cursor primitives -----------------------------------------
87
+
88
+ fn peek(&self, offset: usize) -> Option<char> {
89
+ self.chars.get(self.pos + offset).copied()
90
+ }
91
+
92
+ fn consume(&mut self) -> Option<char> {
93
+ let c = self.chars.get(self.pos).copied();
94
+
95
+ if c.is_some() {
96
+ self.pos += 1;
97
+ }
98
+
99
+ c
100
+ }
101
+
102
+ fn reconsume(&mut self) {
103
+ self.pos -= 1;
104
+ }
105
+
106
+ fn eof(&self) -> bool {
107
+ self.pos >= self.chars.len()
108
+ }
109
+
110
+ // --- main dispatch ---------------------------------------------
111
+
112
+ fn consume_one_token(&mut self) -> Token {
113
+ if self.peek(0) == Some('/') && self.peek(1) == Some('*') {
114
+ return self.consume_comment_token();
115
+ }
116
+
117
+ let c = self.consume().expect("eof handled by caller");
118
+
119
+ if is_whitespace(c) {
120
+ return self.consume_whitespace();
121
+ }
122
+
123
+ if c == '"' || c == '\'' {
124
+ return self.consume_string_token(c);
125
+ }
126
+
127
+ if (c == '+' || c == '-' || c == '.') && number_starts(Some(c), self.peek(0), self.peek(1)) {
128
+ self.reconsume();
129
+ return self.consume_numeric_token();
130
+ }
131
+
132
+ if let Some(kind) = punctuation_kind(c) {
133
+ return Token::bare(kind);
134
+ }
135
+
136
+ match c {
137
+ '#' => {
138
+ if is_ident_code_point(self.peek(0)) || valid_escape(self.peek(0), self.peek(1)) {
139
+ let flag = if ident_sequence_starts(self.peek(0), self.peek(1), self.peek(2)) {
140
+ HashFlag::Id
141
+ } else {
142
+ HashFlag::Unrestricted
143
+ };
144
+
145
+ let name = self.consume_ident_sequence();
146
+
147
+ Token {
148
+ hash_flag: Some(flag),
149
+ ..Token::with_str(Kind::Hash, name)
150
+ }
151
+ } else {
152
+ Token::delim(c)
153
+ }
154
+ }
155
+ '+' | '.' => Token::delim(c),
156
+ '-' => {
157
+ if self.peek(0) == Some('-') && self.peek(1) == Some('>') {
158
+ self.consume();
159
+ self.consume();
160
+ Token::bare(Kind::Cdc)
161
+ } else if ident_sequence_starts(Some(c), self.peek(0), self.peek(1)) {
162
+ self.reconsume();
163
+ self.consume_ident_like_token()
164
+ } else {
165
+ Token::delim(c)
166
+ }
167
+ }
168
+ '<' => {
169
+ if self.peek(0) == Some('!') && self.peek(1) == Some('-') && self.peek(2) == Some('-') {
170
+ self.consume();
171
+ self.consume();
172
+ self.consume();
173
+ Token::bare(Kind::Cdo)
174
+ } else {
175
+ Token::delim(c)
176
+ }
177
+ }
178
+ '@' => {
179
+ if ident_sequence_starts(self.peek(0), self.peek(1), self.peek(2)) {
180
+ Token::with_str(Kind::AtKeyword, self.consume_ident_sequence())
181
+ } else {
182
+ Token::delim(c)
183
+ }
184
+ }
185
+ '\\' => {
186
+ if valid_escape(Some(c), self.peek(0)) {
187
+ self.reconsume();
188
+ self.consume_ident_like_token()
189
+ } else {
190
+ Token::delim(c)
191
+ }
192
+ }
193
+ '0'..='9' => {
194
+ self.reconsume();
195
+ self.consume_numeric_token()
196
+ }
197
+ _ => {
198
+ if is_ident_start_code_point(Some(c)) {
199
+ self.reconsume();
200
+ self.consume_ident_like_token()
201
+ } else {
202
+ Token::delim(c)
203
+ }
204
+ }
205
+ }
206
+ }
207
+
208
+ // --- comments --------------------------------------------------
209
+
210
+ fn consume_comments(&mut self) {
211
+ while self.peek(0) == Some('/') && self.peek(1) == Some('*') {
212
+ self.consume();
213
+ self.consume();
214
+
215
+ while !self.eof() {
216
+ if self.consume() == Some('*') && self.peek(0) == Some('/') {
217
+ self.consume();
218
+ break;
219
+ }
220
+ }
221
+ }
222
+ }
223
+
224
+ fn consume_comment_token(&mut self) -> Token {
225
+ self.consume();
226
+ self.consume();
227
+ let mut buf = String::new();
228
+
229
+ while !self.eof() {
230
+ let c = self.consume().unwrap();
231
+
232
+ if c == '*' && self.peek(0) == Some('/') {
233
+ self.consume();
234
+ break;
235
+ }
236
+
237
+ buf.push(c);
238
+ }
239
+
240
+ Token::with_str(Kind::Comment, buf)
241
+ }
242
+
243
+ fn consume_whitespace(&mut self) -> Token {
244
+ while is_whitespace_opt(self.peek(0)) {
245
+ self.consume();
246
+ }
247
+
248
+ Token::bare(Kind::Whitespace)
249
+ }
250
+
251
+ // --- strings ---------------------------------------------------
252
+
253
+ fn consume_string_token(&mut self, ending: char) -> Token {
254
+ let mut buf = String::new();
255
+
256
+ loop {
257
+ match self.consume() {
258
+ None => return Token::with_str(Kind::String_, buf),
259
+ Some(c) if c == ending => return Token::with_str(Kind::String_, buf),
260
+ Some('\n') => {
261
+ self.reconsume();
262
+ return Token::bare(Kind::BadString);
263
+ }
264
+ Some('\\') => {
265
+ let n = self.peek(0);
266
+
267
+ if n.is_none() {
268
+ continue;
269
+ } else if n == Some('\n') {
270
+ self.consume();
271
+ } else {
272
+ buf.push(self.consume_escaped_code_point());
273
+ }
274
+ }
275
+ Some(c) => buf.push(c),
276
+ }
277
+ }
278
+ }
279
+
280
+ // --- escape ----------------------------------------------------
281
+
282
+ fn consume_escaped_code_point(&mut self) -> char {
283
+ let c = match self.consume() {
284
+ None => return REPLACEMENT,
285
+ Some(c) => c,
286
+ };
287
+
288
+ if !is_hex_digit(Some(c)) {
289
+ return c;
290
+ }
291
+
292
+ let mut hex = String::with_capacity(6);
293
+ hex.push(c);
294
+
295
+ while hex.len() < 6 && is_hex_digit(self.peek(0)) {
296
+ hex.push(self.consume().unwrap());
297
+ }
298
+
299
+ if is_whitespace_opt(self.peek(0)) {
300
+ self.consume();
301
+ }
302
+
303
+ let n = u32::from_str_radix(&hex, 16).unwrap_or(0);
304
+
305
+ if n == 0 || (0xD800..=0xDFFF).contains(&n) || n > 0x10FFFF {
306
+ REPLACEMENT
307
+ } else {
308
+ char::from_u32(n).unwrap_or(REPLACEMENT)
309
+ }
310
+ }
311
+
312
+ // --- ident-like ------------------------------------------------
313
+
314
+ fn consume_ident_sequence(&mut self) -> String {
315
+ let mut buf = String::new();
316
+
317
+ loop {
318
+ let c = self.consume();
319
+
320
+ if is_ident_code_point(c) {
321
+ buf.push(c.unwrap());
322
+ } else if valid_escape(c, self.peek(0)) {
323
+ buf.push(self.consume_escaped_code_point());
324
+ } else {
325
+ if c.is_some() {
326
+ self.reconsume();
327
+ }
328
+ return buf;
329
+ }
330
+ }
331
+ }
332
+
333
+ fn consume_ident_like_token(&mut self) -> Token {
334
+ let name = self.consume_ident_sequence();
335
+
336
+ if name.eq_ignore_ascii_case("url") && self.peek(0) == Some('(') {
337
+ self.consume();
338
+
339
+ while is_whitespace_opt(self.peek(0)) && is_whitespace_opt(self.peek(1)) {
340
+ self.consume();
341
+ }
342
+
343
+ let n1 = self.peek(0);
344
+ let n2 = if is_whitespace_opt(n1) { self.peek(1) } else { n1 };
345
+
346
+ let is_quote = |c: Option<char>| c == Some('"') || c == Some('\'');
347
+
348
+ if is_quote(n1) || (is_whitespace_opt(n1) && is_quote(n2)) {
349
+ Token::with_str(Kind::Function, name)
350
+ } else {
351
+ self.consume_url_token()
352
+ }
353
+ } else if self.peek(0) == Some('(') {
354
+ self.consume();
355
+ Token::with_str(Kind::Function, name)
356
+ } else {
357
+ Token::with_str(Kind::Ident, name)
358
+ }
359
+ }
360
+
361
+ fn consume_url_token(&mut self) -> Token {
362
+ let mut buf = String::new();
363
+
364
+ while is_whitespace_opt(self.peek(0)) {
365
+ self.consume();
366
+ }
367
+
368
+ loop {
369
+ let c = self.consume();
370
+
371
+ match c {
372
+ None | Some(')') => return Token::with_str(Kind::Url, buf),
373
+ Some('"') | Some('\'') | Some('(') => {
374
+ self.consume_bad_url_remnants();
375
+ return Token::bare(Kind::BadUrl);
376
+ }
377
+ Some(' ') | Some('\t') | Some('\n') => {
378
+ while is_whitespace_opt(self.peek(0)) {
379
+ self.consume();
380
+ }
381
+
382
+ let n = self.peek(0);
383
+
384
+ if n.is_none() || n == Some(')') {
385
+ if n.is_some() {
386
+ self.consume();
387
+ }
388
+ return Token::with_str(Kind::Url, buf);
389
+ } else {
390
+ self.consume_bad_url_remnants();
391
+ return Token::bare(Kind::BadUrl);
392
+ }
393
+ }
394
+ Some('\\') => {
395
+ if valid_escape(c, self.peek(0)) {
396
+ buf.push(self.consume_escaped_code_point());
397
+ } else {
398
+ self.consume_bad_url_remnants();
399
+ return Token::bare(Kind::BadUrl);
400
+ }
401
+ }
402
+ Some(c) => {
403
+ if is_non_printable(c) {
404
+ self.consume_bad_url_remnants();
405
+ return Token::bare(Kind::BadUrl);
406
+ }
407
+ buf.push(c);
408
+ }
409
+ }
410
+ }
411
+ }
412
+
413
+ fn consume_bad_url_remnants(&mut self) {
414
+ loop {
415
+ let c = self.consume();
416
+
417
+ if c.is_none() || c == Some(')') {
418
+ return;
419
+ }
420
+
421
+ if valid_escape(c, self.peek(0)) {
422
+ self.consume_escaped_code_point();
423
+ }
424
+ }
425
+ }
426
+
427
+ // --- numbers ---------------------------------------------------
428
+
429
+ fn consume_numeric_token(&mut self) -> Token {
430
+ let (value, flag) = self.consume_number();
431
+
432
+ if ident_sequence_starts(self.peek(0), self.peek(1), self.peek(2)) {
433
+ let unit = self.consume_ident_sequence();
434
+
435
+ Token {
436
+ number_flag: Some(flag),
437
+ unit: Some(unit),
438
+ ..Self::with_number_value(Kind::Dimension, value, flag)
439
+ }
440
+ } else if self.peek(0) == Some('%') {
441
+ self.consume();
442
+ Self::with_number_value(Kind::Percentage, value, flag)
443
+ } else {
444
+ Token {
445
+ number_flag: Some(flag),
446
+ ..Self::with_number_value(Kind::Number, value, flag)
447
+ }
448
+ }
449
+ }
450
+
451
+ fn with_number_value(kind: Kind, value: TokenValue, _flag: NumberFlag) -> Token {
452
+ Token { value, ..Token::bare(kind) }
453
+ }
454
+
455
+ fn consume_number(&mut self) -> (TokenValue, NumberFlag) {
456
+ let mut repr = String::new();
457
+ let mut flag = NumberFlag::Integer;
458
+
459
+ if self.peek(0) == Some('+') || self.peek(0) == Some('-') {
460
+ repr.push(self.consume().unwrap());
461
+ }
462
+
463
+ while is_digit(self.peek(0)) {
464
+ repr.push(self.consume().unwrap());
465
+ }
466
+
467
+ if self.peek(0) == Some('.') && is_digit(self.peek(1)) {
468
+ repr.push(self.consume().unwrap());
469
+ while is_digit(self.peek(0)) {
470
+ repr.push(self.consume().unwrap());
471
+ }
472
+ flag = NumberFlag::Number;
473
+ }
474
+
475
+ let exp = self.peek(0);
476
+ let after_exp = self.peek(1);
477
+
478
+ if (exp == Some('E') || exp == Some('e'))
479
+ && (is_digit(after_exp)
480
+ || ((after_exp == Some('+') || after_exp == Some('-')) && is_digit(self.peek(2))))
481
+ {
482
+ repr.push(self.consume().unwrap());
483
+ if self.peek(0) == Some('+') || self.peek(0) == Some('-') {
484
+ repr.push(self.consume().unwrap());
485
+ }
486
+ while is_digit(self.peek(0)) {
487
+ repr.push(self.consume().unwrap());
488
+ }
489
+ flag = NumberFlag::Number;
490
+ }
491
+
492
+ let value = match flag {
493
+ NumberFlag::Integer => TokenValue::Int(repr.parse().unwrap_or(0)),
494
+ NumberFlag::Number => TokenValue::Float(repr.parse().unwrap_or(0.0)),
495
+ };
496
+
497
+ (value, flag)
498
+ }
499
+ }
500
+
501
+ // --- preprocessing ----------------------------------------------
502
+
503
+ fn preprocess(input: &str) -> Vec<char> {
504
+ let mut out = Vec::with_capacity(input.len());
505
+ let mut iter = input.chars().peekable();
506
+
507
+ while let Some(c) = iter.next() {
508
+ match c {
509
+ '\r' => {
510
+ out.push('\n');
511
+ if iter.peek() == Some(&'\n') {
512
+ iter.next();
513
+ }
514
+ }
515
+ '\x0C' => out.push('\n'),
516
+ '\0' => out.push(REPLACEMENT),
517
+ _ => out.push(c),
518
+ }
519
+ }
520
+
521
+ out
522
+ }
523
+
524
+ // --- code point classifiers -------------------------------------
525
+
526
/// Returns `true` for space, line feed and tab.
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\n' | '\t')
}
529
+
530
/// Returns `true` when `c` is a space, line feed or tab; `None` is never
/// whitespace.
fn is_whitespace_opt(c: Option<char>) -> bool {
    match c {
        Some(ch) => ch == ' ' || ch == '\n' || ch == '\t',
        None => false,
    }
}
533
+
534
/// Returns `true` when `c` is an ASCII digit `0`–`9`.
fn is_digit(c: Option<char>) -> bool {
    c.map_or(false, |ch| ch.is_ascii_digit())
}
537
+
538
/// Returns `true` when `c` is an ASCII hex digit (`0`–`9`, `A`–`F`, `a`–`f`).
fn is_hex_digit(c: Option<char>) -> bool {
    c.map_or(false, |ch| ch.is_ascii_hexdigit())
}
541
+
542
/// Returns `true` when `c` may start an identifier: an ASCII letter, `_`,
/// or any code point at or above U+0080.
fn is_ident_start_code_point(c: Option<char>) -> bool {
    matches!(c, Some(ch) if ch == '_' || ch.is_ascii_alphabetic() || !ch.is_ascii())
}
550
+
551
+ fn is_ident_code_point(c: Option<char>) -> bool {
552
+ is_ident_start_code_point(c) || is_digit(c) || c == Some('-')
553
+ }
554
+
555
/// Returns `true` for the code points treated as non-printable:
/// U+0000–U+0008, U+000B, U+000E–U+001F and U+007F.
fn is_non_printable(c: char) -> bool {
    matches!(
        c,
        '\u{0}'..='\u{8}' | '\u{B}' | '\u{E}'..='\u{1F}' | '\u{7F}'
    )
}
559
+
560
/// §4.3.8 — returns `true` when `c1`, `c2` form a valid escape: a backslash
/// followed by any code point other than a line feed (end of input does not
/// count).
fn valid_escape(c1: Option<char>, c2: Option<char>) -> bool {
    matches!((c1, c2), (Some('\\'), Some(next)) if next != '\n')
}
564
+
565
+ // §4.3.9
566
+ fn ident_sequence_starts(c1: Option<char>, c2: Option<char>, c3: Option<char>) -> bool {
567
+ match c1 {
568
+ Some('-') => is_ident_start_code_point(c2) || c2 == Some('-') || valid_escape(c2, c3),
569
+ Some('\\') => valid_escape(c1, c2),
570
+ _ => is_ident_start_code_point(c1),
571
+ }
572
+ }
573
+
574
+ // §4.3.10
575
+ fn number_starts(c1: Option<char>, c2: Option<char>, c3: Option<char>) -> bool {
576
+ match c1 {
577
+ Some('+') | Some('-') => is_digit(c2) || (c2 == Some('.') && is_digit(c3)),
578
+ Some('.') => is_digit(c2),
579
+ _ => is_digit(c1),
580
+ }
581
+ }
582
+
583
+ fn punctuation_kind(c: char) -> Option<Kind> {
584
+ Some(match c {
585
+ '(' => Kind::LParen,
586
+ ')' => Kind::RParen,
587
+ ',' => Kind::Comma,
588
+ ':' => Kind::Colon,
589
+ ';' => Kind::Semicolon,
590
+ '[' => Kind::LBracket,
591
+ ']' => Kind::RBracket,
592
+ '{' => Kind::LBrace,
593
+ '}' => Kind::RBrace,
594
+ _ => return None,
595
+ })
596
+ }