cataract 0.2.1 → 0.2.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -11,7 +11,7 @@
11
11
  # Do NOT refactor to "clean Ruby" without benchmarking - you will make it slower.
12
12
  #
13
13
  # Example: RuboCop suggests using `.positive?` instead of `> 0`, but benchmarking
14
- # shows `> 0` is 1.26x faster (see benchmark_positive.rb). These micro-optimizations
14
+ # shows `> 0` is 1.26x faster. These micro-optimizations
15
15
  # matter in a hot parsing loop.
16
16
 
17
17
  module Cataract
@@ -65,15 +65,25 @@ module Cataract
65
65
  true
66
66
  end
67
67
 
68
- def initialize(css_string, parent_media_sym: nil, depth: 0)
68
+ def initialize(css_string, parser_options: {}, parent_media_sym: nil, depth: 0)
69
69
  @css = css_string.dup.freeze
70
70
  @pos = 0
71
71
  @len = @css.bytesize
72
72
  @parent_media_sym = parent_media_sym
73
73
 
74
+ # Parser options with defaults
75
+ @parser_options = {
76
+ selector_lists: true
77
+ }.merge(parser_options)
78
+
79
+ # Extract selector_lists option to ivar to avoid repeated hash lookups in hot path
80
+ @selector_lists_enabled = @parser_options[:selector_lists]
81
+
74
82
  # Parser state
75
83
  @rules = [] # Flat array of Rule structs
76
84
  @_media_index = {} # Symbol => Array of rule IDs
85
+ @_selector_lists = {} # Hash: list_id => Array of rule IDs
86
+ @_next_selector_list_id = 0 # Counter for selector list IDs
77
87
  @imports = [] # Array of ImportStatement structs
78
88
  @rule_id_counter = 0 # Next rule ID (0-indexed)
79
89
  @media_query_count = 0 # Safety limit
@@ -103,7 +113,9 @@ module Cataract
103
113
  # Must be a selector-based rule
104
114
  selector = parse_selector
105
115
 
106
- next if selector.nil? || selector.empty?
116
+ if selector.nil? || selector.empty?
117
+ next
118
+ end
107
119
 
108
120
  # Find the block boundaries
109
121
  decl_start = @pos # Should be right after the {
@@ -159,22 +171,46 @@ module Cataract
159
171
  # Split comma-separated selectors into individual rules
160
172
  selectors = selector.split(',')
161
173
 
174
+ # Determine if we should track this as a selector list
175
+ # Check boolean first to potentially avoid size() call via short-circuit evaluation
176
+ list_id = nil
177
+ if @selector_lists_enabled && selectors.size > 1
178
+ list_id = @_next_selector_list_id
179
+ @_next_selector_list_id += 1
180
+ @_selector_lists[list_id] = []
181
+ end
182
+
162
183
  selectors.each do |individual_selector|
163
184
  individual_selector.strip!
164
185
  next if individual_selector.empty?
165
186
 
166
- # Create Rule struct
187
+ rule_id = @rule_id_counter
188
+
189
+ # Dup declarations for each rule in a selector list to avoid shared state
190
+ # (principle of least surprise - modifying one rule shouldn't affect others)
191
+ # Must deep dup: both the array and the Declaration objects inside
192
+ rule_declarations = if list_id
193
+ declarations.map { |d| Declaration.new(d.property, d.value, d.important) }
194
+ else
195
+ declarations
196
+ end
197
+
198
+ # Create Rule struct (with selector_list_id as 7th parameter)
167
199
  rule = Rule.new(
168
- @rule_id_counter, # id
200
+ rule_id, # id
169
201
  individual_selector, # selector
170
- declarations, # declarations
202
+ rule_declarations, # declarations
171
203
  nil, # specificity (calculated lazily)
172
204
  nil, # parent_rule_id
173
- nil # nesting_style
205
+ nil, # nesting_style
206
+ list_id # selector_list_id
174
207
  )
175
208
 
176
209
  @rules << rule
177
210
  @rule_id_counter += 1
211
+
212
+ # Track in selector list if applicable
213
+ @_selector_lists[list_id] << rule_id if list_id
178
214
  end
179
215
  end
180
216
  end
@@ -182,6 +218,7 @@ module Cataract
182
218
  {
183
219
  rules: @rules,
184
220
  _media_index: @_media_index,
221
+ _selector_lists: @_selector_lists,
185
222
  imports: @imports,
186
223
  charset: @charset,
187
224
  _has_nesting: @_has_nesting
@@ -238,17 +275,108 @@ module Cataract
238
275
  true
239
276
  end
240
277
 
241
- # Skip whitespace and comments
278
+ # Skip whitespace and comments until no more progress can be made
279
+ #
280
+ # Optimization: Using `begin...end until` instead of `loop + break` reduces VM overhead:
281
+ # - loop + break: 29 instructions with catch table for break/redo/next, uses throw/send
282
+ # - begin...end until: 24 instructions, simple jump-based loop, no catch table
283
+ # Benchmark shows 15-51% speedup depending on YJIT
242
284
  def skip_ws_and_comments
243
- loop do
285
+ begin
244
286
  old_pos = @pos
245
287
  skip_whitespace
246
288
  skip_comment
247
- break if @pos == old_pos # No progress made
289
+ end until @pos == old_pos # No progress made # rubocop:disable Lint/Loop
290
+ end
291
+
292
+ # Parse a single CSS declaration (property: value)
293
+ #
294
+ # Performance-critical helper that parses one declaration.
295
+ # Shared by parse_mixed_block, parse_declarations, and parse_declarations_block.
296
+ #
297
+ # @param pos [Integer] Current position in CSS string
298
+ # @param end_pos [Integer] End position (boundary for parsing)
299
+ # @param parse_important [Boolean] Whether to parse !important flag (false for at-rules)
300
+ # @return [Array(Declaration|nil, Integer)] Tuple of [declaration, new_position]
301
+ def parse_single_declaration(pos, end_pos, parse_important)
302
+ # Parse property name (scan until ':')
303
+ prop_start = pos
304
+ while pos < end_pos && @css.getbyte(pos) != BYTE_COLON &&
305
+ @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_RBRACE
306
+ pos += 1
307
+ end
308
+
309
+ # Skip if malformed (no colon found)
310
+ if pos >= end_pos || @css.getbyte(pos) != BYTE_COLON
311
+ # Error recovery: skip to next semicolon
312
+ while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON
313
+ pos += 1
314
+ end
315
+ pos += 1 if pos < end_pos && @css.getbyte(pos) == BYTE_SEMICOLON
316
+ return [nil, pos]
317
+ end
318
+
319
+ # Trim trailing whitespace from property
320
+ prop_end = pos
321
+ while prop_end > prop_start && whitespace?(@css.getbyte(prop_end - 1))
322
+ prop_end -= 1
323
+ end
324
+
325
+ # Extract and normalize property name
326
+ property = byteslice_encoded(prop_start, prop_end - prop_start)
327
+ # Custom properties (--foo) are case-sensitive and can contain Unicode
328
+ # Regular properties are ASCII-only and case-insensitive
329
+ unless property.bytesize >= 2 && property.getbyte(0) == BYTE_HYPHEN && property.getbyte(1) == BYTE_HYPHEN
330
+ property.force_encoding('US-ASCII')
331
+ property.downcase!
332
+ end
333
+
334
+ pos += 1 # Skip ':'
335
+
336
+ # Skip leading whitespace in value
337
+ while pos < end_pos && whitespace?(@css.getbyte(pos))
338
+ pos += 1
339
+ end
340
+
341
+ # Parse value (scan until ';' or '}')
342
+ val_start = pos
343
+ while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_RBRACE
344
+ pos += 1
345
+ end
346
+ val_end = pos
347
+
348
+ # Trim trailing whitespace from value
349
+ while val_end > val_start && whitespace?(@css.getbyte(val_end - 1))
350
+ val_end -= 1
351
+ end
352
+
353
+ value = byteslice_encoded(val_start, val_end - val_start)
354
+
355
+ # Parse !important flag if requested
356
+ important = false
357
+ if parse_important && value.end_with?('!important')
358
+ important = true
359
+ # Remove '!important' and trailing whitespace
360
+ value = value[0, value.length - 10].rstrip
248
361
  end
362
+
363
+ # Skip semicolon if present
364
+ pos += 1 if pos < end_pos && @css.getbyte(pos) == BYTE_SEMICOLON
365
+
366
+ # Return nil if empty declaration
367
+ return [nil, pos] if prop_end <= prop_start || val_end <= val_start
368
+
369
+ [Declaration.new(property, value, important), pos]
249
370
  end
250
371
 
251
372
  # Find matching closing brace
373
+ #
374
+ # Performance notes (benchmarked on bootstrap.css with 2,400 braces):
375
+ # - Using `return` instead of `break` avoids catch table overhead (~2% faster)
376
+ # - Checking RBRACE before LBRACE is faster because closing braces are
377
+ # encountered more frequently when searching forward from an opening brace
378
+ # - Combined optimizations: baseline 666ms → optimized 652ms (2% improvement)
379
+ #
252
380
  # Translated from C: see ext/cataract/css_parser.c find_matching_brace
253
381
  def find_matching_brace(start_pos)
254
382
  depth = 1
@@ -256,11 +384,11 @@ module Cataract
256
384
 
257
385
  while pos < @len
258
386
  byte = @css.getbyte(pos)
259
- if byte == BYTE_LBRACE
260
- depth += 1
261
- elsif byte == BYTE_RBRACE
387
+ if byte == BYTE_RBRACE
262
388
  depth -= 1
263
- break if depth == 0 # Found matching brace, exit immediately
389
+ return pos if depth == 0
390
+ elsif byte == BYTE_LBRACE
391
+ depth += 1
264
392
  end
265
393
  pos += 1
266
394
  end
@@ -288,6 +416,7 @@ module Cataract
288
416
 
289
417
  # Trim whitespace from selector (in-place to avoid allocation)
290
418
  selector_text.strip!
419
+ selector_text
291
420
  end
292
421
 
293
422
  # Parse mixed block containing declarations AND nested selectors/at-rules
@@ -458,64 +587,9 @@ module Cataract
458
587
  next
459
588
  end
460
589
 
461
- # This is a declaration - parse it
462
- prop_start = pos
463
- while pos < end_pos && @css.getbyte(pos) != BYTE_COLON &&
464
- @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_LBRACE
465
- pos += 1
466
- end
467
-
468
- if pos >= end_pos || @css.getbyte(pos) != BYTE_COLON
469
- # Malformed - skip to semicolon
470
- while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON
471
- pos += 1
472
- end
473
- pos += 1 if pos < end_pos
474
- next
475
- end
476
-
477
- prop_end = pos
478
- # Trim trailing whitespace
479
- while prop_end > prop_start && whitespace?(@css.getbyte(prop_end - 1))
480
- prop_end -= 1
481
- end
482
-
483
- property = byteslice_encoded(prop_start, prop_end - prop_start, encoding: 'US-ASCII')
484
- property.downcase!
485
-
486
- pos += 1 # Skip :
487
-
488
- # Skip leading whitespace in value
489
- while pos < end_pos && whitespace?(@css.getbyte(pos))
490
- pos += 1
491
- end
492
-
493
- # Parse value (read until ';' or '}')
494
- val_start = pos
495
- while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_RBRACE
496
- pos += 1
497
- end
498
- val_end = pos
499
-
500
- # Trim trailing whitespace from value
501
- while val_end > val_start && whitespace?(@css.getbyte(val_end - 1))
502
- val_end -= 1
503
- end
504
-
505
- value = byteslice_encoded(val_start, val_end - val_start)
506
-
507
- # Check for !important flag
508
- important = false
509
- if value.end_with?('!important')
510
- important = true
511
- # NOTE: Using rstrip here instead of manual byte loop since !important is rare (not hot path)
512
- value = value[0, value.length - 10].rstrip # Remove '!important' and trailing whitespace
513
- end
514
-
515
- pos += 1 if pos < end_pos && @css.getbyte(pos) == BYTE_SEMICOLON
516
-
517
- # Create declaration
518
- declarations << Declaration.new(property, value, important) if prop_end > prop_start && val_end > val_start
590
+ # This is a declaration - parse it using shared helper
591
+ decl, pos = parse_single_declaration(pos, end_pos, true)
592
+ declarations << decl if decl
519
593
  end
520
594
 
521
595
  declarations
@@ -553,20 +627,44 @@ module Cataract
553
627
  next
554
628
  end
555
629
 
556
- property = byteslice_encoded(property_start, @pos - property_start, encoding: 'US-ASCII')
630
+ # Extract property name - use UTF-8 encoding to support custom properties with Unicode
631
+ property = byteslice_encoded(property_start, @pos - property_start)
557
632
  property.strip!
558
- property.downcase!
633
+ # Custom properties (--foo) are case-sensitive and can contain Unicode
634
+ # Regular properties are ASCII-only and case-insensitive
635
+ unless property.bytesize >= 2 && property.getbyte(0) == BYTE_HYPHEN && property.getbyte(1) == BYTE_HYPHEN
636
+ # Regular property: force ASCII encoding and downcase
637
+ property.force_encoding('US-ASCII')
638
+ property.downcase!
639
+ end
559
640
  @pos += 1 # skip ':'
560
641
 
561
642
  skip_ws_and_comments
562
643
 
563
- # Parse value (read until ';' or '}')
644
+ # Parse value (read until ';' or '}', but respect quoted strings)
564
645
  value_start = @pos
565
646
  important = false
647
+ in_quote = nil # nil, BYTE_SQUOTE, or BYTE_DQUOTE
566
648
 
567
649
  until eof?
568
650
  byte = peek_byte
569
- break if byte == BYTE_SEMICOLON || byte == BYTE_RBRACE
651
+
652
+ if in_quote
653
+ # Inside quoted string - only exit on matching quote
654
+ if byte == in_quote
655
+ in_quote = nil
656
+ elsif byte == BYTE_BACKSLASH && @pos + 1 < @len
657
+ # Skip escaped character
658
+ @pos += 1
659
+ end
660
+ else
661
+ # Not in quote - check for terminators or quote start
662
+ break if byte == BYTE_SEMICOLON || byte == BYTE_RBRACE
663
+
664
+ if byte == BYTE_SQUOTE || byte == BYTE_DQUOTE
665
+ in_quote = byte
666
+ end
667
+ end
570
668
 
571
669
  @pos += 1
572
670
  end
@@ -587,7 +685,7 @@ module Cataract
587
685
  end
588
686
 
589
687
  # Check for 'important' (9 chars)
590
- if i >= 8 && value[(i - 8)..i] == 'important'
688
+ if i >= 8 && value[(i - 8), 9] == 'important'
591
689
  i -= 9
592
690
  # Skip whitespace before 'important'
593
691
  while i >= 0
@@ -644,16 +742,8 @@ module Cataract
644
742
 
645
743
  charset_value = byteslice_encoded(value_start, @pos - value_start)
646
744
  charset_value.strip!
647
- # Remove quotes (byte-by-byte)
648
- result = String.new
649
- i = 0
650
- len = charset_value.bytesize
651
- while i < len
652
- byte = charset_value.getbyte(i)
653
- result << charset_value[i] unless byte == BYTE_DQUOTE || byte == BYTE_SQUOTE
654
- i += 1
655
- end
656
- @charset = result
745
+ # Remove quotes
746
+ @charset = charset_value.delete('"\'')
657
747
 
658
748
  @pos += 1 if peek_byte == BYTE_SEMICOLON # consume semicolon
659
749
  return
@@ -702,11 +792,24 @@ module Cataract
702
792
  # Recursively parse block content (preserve parent media context)
703
793
  nested_parser = Parser.new(
704
794
  byteslice_encoded(block_start, block_end - block_start),
705
- parent_media_sym: @parent_media_sym, depth: @depth + 1
795
+ parser_options: @parser_options,
796
+ parent_media_sym: @parent_media_sym,
797
+ depth: @depth + 1
706
798
  )
707
799
 
708
800
  nested_result = nested_parser.parse
709
801
 
802
+ # Merge nested selector_lists with offsetted IDs
803
+ list_id_offset = @_next_selector_list_id
804
+ if nested_result[:_selector_lists] && !nested_result[:_selector_lists].empty?
805
+ nested_result[:_selector_lists].each do |list_id, rule_ids|
806
+ new_list_id = list_id + list_id_offset
807
+ offsetted_rule_ids = rule_ids.map { |rid| rid + @rule_id_counter }
808
+ @_selector_lists[new_list_id] = offsetted_rule_ids
809
+ end
810
+ @_next_selector_list_id = list_id_offset + nested_result[:_selector_lists].size
811
+ end
812
+
710
813
  # Merge nested media_index into ours
711
814
  nested_result[:_media_index].each do |media, rule_ids|
712
815
  @_media_index[media] ||= []
@@ -717,6 +820,10 @@ module Cataract
717
820
  # Add nested rules to main rules array
718
821
  nested_result[:rules].each do |rule|
719
822
  rule.id = @rule_id_counter
823
+ # Update selector_list_id if applicable
824
+ if rule.is_a?(Rule) && rule.selector_list_id
825
+ rule.selector_list_id += list_id_offset
826
+ end
720
827
  @rule_id_counter += 1
721
828
  @rules << rule
722
829
  end
@@ -776,12 +883,24 @@ module Cataract
776
883
  # Parse the content with the combined media context
777
884
  nested_parser = Parser.new(
778
885
  byteslice_encoded(block_start, block_end - block_start),
886
+ parser_options: @parser_options,
779
887
  parent_media_sym: combined_media_sym,
780
888
  depth: @depth + 1
781
889
  )
782
890
 
783
891
  nested_result = nested_parser.parse
784
892
 
893
+ # Merge nested selector_lists with offsetted IDs
894
+ list_id_offset = @_next_selector_list_id
895
+ if nested_result[:_selector_lists] && !nested_result[:_selector_lists].empty?
896
+ nested_result[:_selector_lists].each do |list_id, rule_ids|
897
+ new_list_id = list_id + list_id_offset
898
+ offsetted_rule_ids = rule_ids.map { |rid| rid + @rule_id_counter }
899
+ @_selector_lists[new_list_id] = offsetted_rule_ids
900
+ end
901
+ @_next_selector_list_id = list_id_offset + nested_result[:_selector_lists].size
902
+ end
903
+
785
904
  # Merge nested media_index into ours (for nested @media)
786
905
  nested_result[:_media_index].each do |media, rule_ids|
787
906
  @_media_index[media] ||= []
@@ -792,6 +911,10 @@ module Cataract
792
911
  # Add nested rules to main rules array and update media_index
793
912
  nested_result[:rules].each do |rule|
794
913
  rule.id = @rule_id_counter
914
+ # Update selector_list_id if applicable
915
+ if rule.is_a?(Rule) && rule.selector_list_id
916
+ rule.selector_list_id += list_id_offset
917
+ end
795
918
 
796
919
  # Extract media types and add to each first (if different from full query)
797
920
  # We add these BEFORE the full query so that when iterating the media_index hash,
@@ -856,7 +979,11 @@ module Cataract
856
979
 
857
980
  # Parse keyframe blocks as rules (0%/from/to etc)
858
981
  # Create a nested parser context
859
- nested_parser = Parser.new(byteslice_encoded(block_start, block_end - block_start), depth: @depth + 1)
982
+ nested_parser = Parser.new(
983
+ byteslice_encoded(block_start, block_end - block_start),
984
+ parser_options: @parser_options,
985
+ depth: @depth + 1
986
+ )
860
987
  nested_result = nested_parser.parse
861
988
  content = nested_result[:rules]
862
989
 
@@ -1096,7 +1223,7 @@ module Cataract
1096
1223
  result = String.new
1097
1224
  result << parent_selector
1098
1225
  result << ' '
1099
- result << nested_selector.byteslice(start_pos..-1)
1226
+ result << nested_selector.byteslice(start_pos, nested_selector.bytesize - start_pos)
1100
1227
 
1101
1228
  [result, nesting_style]
1102
1229
  end
@@ -1120,7 +1247,8 @@ module Cataract
1120
1247
  # If child is a condition (contains ':'), wrap it in parentheses
1121
1248
  combined += if child_str.include?(':')
1122
1249
  # Add parens if not already present
1123
- if child_str.start_with?('(') && child_str.end_with?(')')
1250
+ len = child_str.bytesize
1251
+ if len > 1 && child_str.getbyte(0) == BYTE_LPAREN && child_str.getbyte(len - 1) == BYTE_RPAREN
1124
1252
  child_str
1125
1253
  else
1126
1254
  "(#{child_str})"
@@ -1282,56 +1410,9 @@ module Cataract
1282
1410
  end
1283
1411
  break if pos >= end_pos
1284
1412
 
1285
- # Parse property name (read until ':')
1286
- prop_start = pos
1287
- while pos < end_pos && @css.getbyte(pos) != BYTE_COLON && @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_RBRACE
1288
- pos += 1
1289
- end
1290
-
1291
- # Skip if no colon found (malformed)
1292
- if pos >= end_pos || @css.getbyte(pos) != BYTE_COLON
1293
- # Try to recover by finding next semicolon
1294
- while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON
1295
- pos += 1
1296
- end
1297
- pos += 1 if pos < end_pos && @css.getbyte(pos) == BYTE_SEMICOLON
1298
- next
1299
- end
1300
-
1301
- prop_end = pos
1302
- # Trim trailing whitespace from property
1303
- while prop_end > prop_start && whitespace?(@css.getbyte(prop_end - 1))
1304
- prop_end -= 1
1305
- end
1306
-
1307
- property = byteslice_encoded(prop_start, prop_end - prop_start, encoding: 'US-ASCII')
1308
- property.downcase!
1309
-
1310
- pos += 1 # Skip ':'
1311
-
1312
- # Skip leading whitespace in value
1313
- while pos < end_pos && whitespace?(@css.getbyte(pos))
1314
- pos += 1
1315
- end
1316
-
1317
- # Parse value (read until ';' or '}')
1318
- val_start = pos
1319
- while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_RBRACE
1320
- pos += 1
1321
- end
1322
- val_end = pos
1323
-
1324
- # Trim trailing whitespace from value
1325
- while val_end > val_start && whitespace?(@css.getbyte(val_end - 1))
1326
- val_end -= 1
1327
- end
1328
-
1329
- value = byteslice_encoded(val_start, val_end - val_start)
1330
-
1331
- pos += 1 if pos < end_pos && @css.getbyte(pos) == BYTE_SEMICOLON
1332
-
1333
- # Create Declaration struct (at-rules don't use !important)
1334
- declarations << Declaration.new(property, value, false)
1413
+ # Parse declaration using shared helper (at-rules don't use !important)
1414
+ decl, pos = parse_single_declaration(pos, end_pos, false)
1415
+ declarations << decl if decl
1335
1416
  end
1336
1417
 
1337
1418
  declarations