RubyGems - cataract - Versions diffs - 0.2.1 → 0.2.3 - Mend

cataract 0.2.1 → 0.2.3

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (20)

checksums.yaml +4 -4
data/.github/workflows/ci.yml +1 -1
data/.rubocop.yml +2 -0
data/BENCHMARKS.md +41 -38
data/CHANGELOG.md +16 -0
data/README.md +9 -3
data/ext/cataract/cataract.c +273 -92
data/ext/cataract/cataract.h +4 -3
data/ext/cataract/css_parser.c +125 -11
data/ext/cataract/flatten.c +271 -16
data/lib/cataract/declaration.rb +19 -0
data/lib/cataract/pure/flatten.rb +103 -8
data/lib/cataract/pure/parser.rb +222 -141
data/lib/cataract/pure/serializer.rb +217 -115
data/lib/cataract/pure.rb +4 -2
data/lib/cataract/rule.rb +39 -3
data/lib/cataract/stylesheet.rb +137 -14
data/lib/cataract/stylesheet_scope.rb +11 -4
data/lib/cataract/version.rb +1 -1
metadata +1 -1

data/lib/cataract/pure/parser.rb CHANGED

@@ -11,7 +11,7 @@
 # Do NOT refactor to "clean Ruby" without benchmarking - you will make it slower.
 #
 # Example: RuboCop suggests using `.positive?` instead of `> 0`, but benchmarking
-# shows `> 0` is 1.26x faster (see benchmark_positive.rb). These micro-optimizations
+# shows `> 0` is 1.26x faster. These micro-optimizations
 # matter in a hot parsing loop.
 module Cataract
@@ -65,15 +65,25 @@ module Cataract
       true
     end
-    def initialize(css_string, parent_media_sym: nil, depth: 0)
+    def initialize(css_string, parser_options: {}, parent_media_sym: nil, depth: 0)
       @css = css_string.dup.freeze
       @pos = 0
       @len = @css.bytesize
       @parent_media_sym = parent_media_sym
+      # Parser options with defaults
+      @parser_options = {
+        selector_lists: true
+      }.merge(parser_options)
+      # Extract selector_lists option to ivar to avoid repeated hash lookups in hot path
+      @selector_lists_enabled = @parser_options[:selector_lists]
       # Parser state
       @rules = []                    # Flat array of Rule structs
       @_media_index = {}             # Symbol => Array of rule IDs
+      @_selector_lists = {}          # Hash: list_id => Array of rule IDs
+      @_next_selector_list_id = 0    # Counter for selector list IDs
       @imports = []                  # Array of ImportStatement structs
       @rule_id_counter = 0           # Next rule ID (0-indexed)
       @media_query_count = 0         # Safety limit
@@ -103,7 +113,9 @@ module Cataract
         # Must be a selector-based rule
         selector = parse_selector
-        next if selector.nil? || selector.empty?
+        if selector.nil? || selector.empty?
+          next
+        end
         # Find the block boundaries
         decl_start = @pos # Should be right after the {
@@ -159,22 +171,46 @@ module Cataract
           # Split comma-separated selectors into individual rules
           selectors = selector.split(',')
+          # Determine if we should track this as a selector list
+          # Check boolean first to potentially avoid size() call via short-circuit evaluation
+          list_id = nil
+          if @selector_lists_enabled && selectors.size > 1
+            list_id = @_next_selector_list_id
+            @_next_selector_list_id += 1
+            @_selector_lists[list_id] = []
+          end
           selectors.each do |individual_selector|
             individual_selector.strip!
             next if individual_selector.empty?
-            # Create Rule struct
+            rule_id = @rule_id_counter
+            # Dup declarations for each rule in a selector list to avoid shared state
+            # (principle of least surprise - modifying one rule shouldn't affect others)
+            # Must deep dup: both the array and the Declaration objects inside
+            rule_declarations = if list_id
+                                  declarations.map { |d| Declaration.new(d.property, d.value, d.important) }
+                                else
+                                  declarations
+                                end
+            # Create Rule struct (with selector_list_id as 7th parameter)
             rule = Rule.new(
-              @rule_id_counter,    # id
+              rule_id,             # id
               individual_selector, # selector
-              declarations,        # declarations
+              rule_declarations,   # declarations
               nil,                 # specificity (calculated lazily)
               nil,                 # parent_rule_id
-              nil                  # nesting_style
+              nil,                 # nesting_style
+              list_id              # selector_list_id
             )
             @rules << rule
             @rule_id_counter += 1
+            # Track in selector list if applicable
+            @_selector_lists[list_id] << rule_id if list_id
           end
         end
       end
@@ -182,6 +218,7 @@ module Cataract
       {
         rules: @rules,
         _media_index: @_media_index,
+        _selector_lists: @_selector_lists,
         imports: @imports,
         charset: @charset,
         _has_nesting: @_has_nesting
@@ -238,17 +275,108 @@ module Cataract
       true
     end
-    # Skip whitespace and comments
+    # Skip whitespace and comments until no more progress can be made
+    #
+    # Optimization: Using `begin...end until` instead of `loop + break` reduces VM overhead:
+    # - loop + break: 29 instructions with catch table for break/redo/next, uses throw/send
+    # - begin...end until: 24 instructions, simple jump-based loop, no catch table
+    # Benchmark shows 15-51% speedup depending on YJIT
     def skip_ws_and_comments
-      loop do
+      begin
         old_pos = @pos
         skip_whitespace
         skip_comment
-        break if @pos == old_pos # No progress made
+      end until @pos == old_pos # No progress made # rubocop:disable Lint/Loop
+    end
+    # Parse a single CSS declaration (property: value)
+    #
+    # Performance-critical helper that parses one declaration.
+    # Shared by parse_mixed_block, parse_declarations, and parse_declarations_block.
+    #
+    # @param pos [Integer] Current position in CSS string
+    # @param end_pos [Integer] End position (boundary for parsing)
+    # @param parse_important [Boolean] Whether to parse !important flag (false for at-rules)
+    # @return [Array(Declaration|nil, Integer)] Tuple of [declaration, new_position]
+    def parse_single_declaration(pos, end_pos, parse_important)
+      # Parse property name (scan until ':')
+      prop_start = pos
+      while pos < end_pos && @css.getbyte(pos) != BYTE_COLON &&
+            @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_RBRACE
+        pos += 1
+      end
+      # Skip if malformed (no colon found)
+      if pos >= end_pos || @css.getbyte(pos) != BYTE_COLON
+        # Error recovery: skip to next semicolon
+        while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON
+          pos += 1
+        end
+        pos += 1 if pos < end_pos && @css.getbyte(pos) == BYTE_SEMICOLON
+        return [nil, pos]
+      end
+      # Trim trailing whitespace from property
+      prop_end = pos
+      while prop_end > prop_start && whitespace?(@css.getbyte(prop_end - 1))
+        prop_end -= 1
+      end
+      # Extract and normalize property name
+      property = byteslice_encoded(prop_start, prop_end - prop_start)
+      # Custom properties (--foo) are case-sensitive and can contain Unicode
+      # Regular properties are ASCII-only and case-insensitive
+      unless property.bytesize >= 2 && property.getbyte(0) == BYTE_HYPHEN && property.getbyte(1) == BYTE_HYPHEN
+        property.force_encoding('US-ASCII')
+        property.downcase!
+      end
+      pos += 1 # Skip ':'
+      # Skip leading whitespace in value
+      while pos < end_pos && whitespace?(@css.getbyte(pos))
+        pos += 1
+      end
+      # Parse value (scan until ';' or '}')
+      val_start = pos
+      while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_RBRACE
+        pos += 1
+      end
+      val_end = pos
+      # Trim trailing whitespace from value
+      while val_end > val_start && whitespace?(@css.getbyte(val_end - 1))
+        val_end -= 1
+      end
+      value = byteslice_encoded(val_start, val_end - val_start)
+      # Parse !important flag if requested
+      important = false
+      if parse_important && value.end_with?('!important')
+        important = true
+        # Remove '!important' and trailing whitespace
+        value = value[0, value.length - 10].rstrip
       end
+      # Skip semicolon if present
+      pos += 1 if pos < end_pos && @css.getbyte(pos) == BYTE_SEMICOLON
+      # Return nil if empty declaration
+      return [nil, pos] if prop_end <= prop_start || val_end <= val_start
+      [Declaration.new(property, value, important), pos]
     end
     # Find matching closing brace
+    #
+    # Performance notes (benchmarked on bootstrap.css with 2,400 braces):
+    # - Using `return` instead of `break` avoids catch table overhead (~2% faster)
+    # - Checking RBRACE before LBRACE is faster because closing braces are
+    #   encountered more frequently when searching forward from an opening brace
+    # - Combined optimizations: baseline 666ms → optimized 652ms (2% improvement)
+    #
     # Translated from C: see ext/cataract/css_parser.c find_matching_brace
     def find_matching_brace(start_pos)
       depth = 1
@@ -256,11 +384,11 @@ module Cataract
       while pos < @len
         byte = @css.getbyte(pos)
-        if byte == BYTE_LBRACE
-          depth += 1
-        elsif byte == BYTE_RBRACE
+        if byte == BYTE_RBRACE
           depth -= 1
-          break if depth == 0 # Found matching brace, exit immediately
+          return pos if depth == 0
+        elsif byte == BYTE_LBRACE
+          depth += 1
         end
         pos += 1
       end
@@ -288,6 +416,7 @@ module Cataract
       # Trim whitespace from selector (in-place to avoid allocation)
       selector_text.strip!
+      selector_text
     end
     # Parse mixed block containing declarations AND nested selectors/at-rules
@@ -458,64 +587,9 @@ module Cataract
           next
         end
-        # This is a declaration - parse it
-        prop_start = pos
-        while pos < end_pos && @css.getbyte(pos) != BYTE_COLON &&
-              @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_LBRACE
-          pos += 1
-        end
-        if pos >= end_pos || @css.getbyte(pos) != BYTE_COLON
-          # Malformed - skip to semicolon
-          while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON
-            pos += 1
-          end
-          pos += 1 if pos < end_pos
-          next
-        end
-        prop_end = pos
-        # Trim trailing whitespace
-        while prop_end > prop_start && whitespace?(@css.getbyte(prop_end - 1))
-          prop_end -= 1
-        end
-        property = byteslice_encoded(prop_start, prop_end - prop_start, encoding: 'US-ASCII')
-        property.downcase!
-        pos += 1 # Skip :
-        # Skip leading whitespace in value
-        while pos < end_pos && whitespace?(@css.getbyte(pos))
-          pos += 1
-        end
-        # Parse value (read until ';' or '}')
-        val_start = pos
-        while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_RBRACE
-          pos += 1
-        end
-        val_end = pos
-        # Trim trailing whitespace from value
-        while val_end > val_start && whitespace?(@css.getbyte(val_end - 1))
-          val_end -= 1
-        end
-        value = byteslice_encoded(val_start, val_end - val_start)
-        # Check for !important flag
-        important = false
-        if value.end_with?('!important')
-          important = true
-          # NOTE: Using rstrip here instead of manual byte loop since !important is rare (not hot path)
-          value = value[0, value.length - 10].rstrip # Remove '!important' and trailing whitespace
-        end
-        pos += 1 if pos < end_pos && @css.getbyte(pos) == BYTE_SEMICOLON
-        # Create declaration
-        declarations << Declaration.new(property, value, important) if prop_end > prop_start && val_end > val_start
+        # This is a declaration - parse it using shared helper
+        decl, pos = parse_single_declaration(pos, end_pos, true)
+        declarations << decl if decl
       end
       declarations
@@ -553,20 +627,44 @@ module Cataract
           next
         end
-        property = byteslice_encoded(property_start, @pos - property_start, encoding: 'US-ASCII')
+        # Extract property name - use UTF-8 encoding to support custom properties with Unicode
+        property = byteslice_encoded(property_start, @pos - property_start)
         property.strip!
-        property.downcase!
+        # Custom properties (--foo) are case-sensitive and can contain Unicode
+        # Regular properties are ASCII-only and case-insensitive
+        unless property.bytesize >= 2 && property.getbyte(0) == BYTE_HYPHEN && property.getbyte(1) == BYTE_HYPHEN
+          # Regular property: force ASCII encoding and downcase
+          property.force_encoding('US-ASCII')
+          property.downcase!
+        end
         @pos += 1 # skip ':'
         skip_ws_and_comments
-        # Parse value (read until ';' or '}')
+        # Parse value (read until ';' or '}', but respect quoted strings)
         value_start = @pos
         important = false
+        in_quote = nil # nil, BYTE_SQUOTE, or BYTE_DQUOTE
         until eof?
           byte = peek_byte
-          break if byte == BYTE_SEMICOLON || byte == BYTE_RBRACE
+          if in_quote
+            # Inside quoted string - only exit on matching quote
+            if byte == in_quote
+              in_quote = nil
+            elsif byte == BYTE_BACKSLASH && @pos + 1 < @len
+              # Skip escaped character
+              @pos += 1
+            end
+          else
+            # Not in quote - check for terminators or quote start
+            break if byte == BYTE_SEMICOLON || byte == BYTE_RBRACE
+            if byte == BYTE_SQUOTE || byte == BYTE_DQUOTE
+              in_quote = byte
+            end
+          end
           @pos += 1
         end
@@ -587,7 +685,7 @@ module Cataract
           end
           # Check for 'important' (9 chars)
-          if i >= 8 && value[(i - 8)..i] == 'important'
+          if i >= 8 && value[(i - 8), 9] == 'important'
             i -= 9
             # Skip whitespace before 'important'
             while i >= 0
@@ -644,16 +742,8 @@ module Cataract
         charset_value = byteslice_encoded(value_start, @pos - value_start)
         charset_value.strip!
-        # Remove quotes (byte-by-byte)
-        result = String.new
-        i = 0
-        len = charset_value.bytesize
-        while i < len
-          byte = charset_value.getbyte(i)
-          result << charset_value[i] unless byte == BYTE_DQUOTE || byte == BYTE_SQUOTE
-          i += 1
-        end
-        @charset = result
+        # Remove quotes
+        @charset = charset_value.delete('"\'')
         @pos += 1 if peek_byte == BYTE_SEMICOLON # consume semicolon
         return
@@ -702,11 +792,24 @@ module Cataract
         # Recursively parse block content (preserve parent media context)
         nested_parser = Parser.new(
           byteslice_encoded(block_start, block_end - block_start),
-          parent_media_sym: @parent_media_sym, depth: @depth + 1
+          parser_options: @parser_options,
+          parent_media_sym: @parent_media_sym,
+          depth: @depth + 1
         )
         nested_result = nested_parser.parse
+        # Merge nested selector_lists with offsetted IDs
+        list_id_offset = @_next_selector_list_id
+        if nested_result[:_selector_lists] && !nested_result[:_selector_lists].empty?
+          nested_result[:_selector_lists].each do |list_id, rule_ids|
+            new_list_id = list_id + list_id_offset
+            offsetted_rule_ids = rule_ids.map { |rid| rid + @rule_id_counter }
+            @_selector_lists[new_list_id] = offsetted_rule_ids
+          end
+          @_next_selector_list_id = list_id_offset + nested_result[:_selector_lists].size
+        end
         # Merge nested media_index into ours
         nested_result[:_media_index].each do |media, rule_ids|
           @_media_index[media] ||= []
@@ -717,6 +820,10 @@ module Cataract
         # Add nested rules to main rules array
         nested_result[:rules].each do |rule|
           rule.id = @rule_id_counter
+          # Update selector_list_id if applicable
+          if rule.is_a?(Rule) && rule.selector_list_id
+            rule.selector_list_id += list_id_offset
+          end
           @rule_id_counter += 1
           @rules << rule
         end
@@ -776,12 +883,24 @@ module Cataract
         # Parse the content with the combined media context
         nested_parser = Parser.new(
           byteslice_encoded(block_start, block_end - block_start),
+          parser_options: @parser_options,
           parent_media_sym: combined_media_sym,
           depth: @depth + 1
         )
         nested_result = nested_parser.parse
+        # Merge nested selector_lists with offsetted IDs
+        list_id_offset = @_next_selector_list_id
+        if nested_result[:_selector_lists] && !nested_result[:_selector_lists].empty?
+          nested_result[:_selector_lists].each do |list_id, rule_ids|
+            new_list_id = list_id + list_id_offset
+            offsetted_rule_ids = rule_ids.map { |rid| rid + @rule_id_counter }
+            @_selector_lists[new_list_id] = offsetted_rule_ids
+          end
+          @_next_selector_list_id = list_id_offset + nested_result[:_selector_lists].size
+        end
         # Merge nested media_index into ours (for nested @media)
         nested_result[:_media_index].each do |media, rule_ids|
           @_media_index[media] ||= []
@@ -792,6 +911,10 @@ module Cataract
         # Add nested rules to main rules array and update media_index
         nested_result[:rules].each do |rule|
           rule.id = @rule_id_counter
+          # Update selector_list_id if applicable
+          if rule.is_a?(Rule) && rule.selector_list_id
+            rule.selector_list_id += list_id_offset
+          end
           # Extract media types and add to each first (if different from full query)
           # We add these BEFORE the full query so that when iterating the media_index hash,
@@ -856,7 +979,11 @@ module Cataract
         # Parse keyframe blocks as rules (0%/from/to etc)
         # Create a nested parser context
-        nested_parser = Parser.new(byteslice_encoded(block_start, block_end - block_start), depth: @depth + 1)
+        nested_parser = Parser.new(
+          byteslice_encoded(block_start, block_end - block_start),
+          parser_options: @parser_options,
+          depth: @depth + 1
+        )
         nested_result = nested_parser.parse
         content = nested_result[:rules]
@@ -1096,7 +1223,7 @@ module Cataract
         result = String.new
         result << parent_selector
         result << ' '
-        result << nested_selector.byteslice(start_pos..-1)
+        result << nested_selector.byteslice(start_pos, nested_selector.bytesize - start_pos)
         [result, nesting_style]
       end
@@ -1120,7 +1247,8 @@ module Cataract
       # If child is a condition (contains ':'), wrap it in parentheses
       combined += if child_str.include?(':')
                     # Add parens if not already present
-                    if child_str.start_with?('(') && child_str.end_with?(')')
+                    len = child_str.bytesize
+                    if len > 1 && child_str.getbyte(0) == BYTE_LPAREN && child_str.getbyte(len - 1) == BYTE_RPAREN
                       child_str
                     else
                       "(#{child_str})"
@@ -1282,56 +1410,9 @@ module Cataract
         end
         break if pos >= end_pos
-        # Parse property name (read until ':')
-        prop_start = pos
-        while pos < end_pos && @css.getbyte(pos) != BYTE_COLON && @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_RBRACE
-          pos += 1
-        end
-        # Skip if no colon found (malformed)
-        if pos >= end_pos || @css.getbyte(pos) != BYTE_COLON
-          # Try to recover by finding next semicolon
-          while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON
-            pos += 1
-          end
-          pos += 1 if pos < end_pos && @css.getbyte(pos) == BYTE_SEMICOLON
-          next
-        end
-        prop_end = pos
-        # Trim trailing whitespace from property
-        while prop_end > prop_start && whitespace?(@css.getbyte(prop_end - 1))
-          prop_end -= 1
-        end
-        property = byteslice_encoded(prop_start, prop_end - prop_start, encoding: 'US-ASCII')
-        property.downcase!
-        pos += 1 # Skip ':'
-        # Skip leading whitespace in value
-        while pos < end_pos && whitespace?(@css.getbyte(pos))
-          pos += 1
-        end
-        # Parse value (read until ';' or '}')
-        val_start = pos
-        while pos < end_pos && @css.getbyte(pos) != BYTE_SEMICOLON && @css.getbyte(pos) != BYTE_RBRACE
-          pos += 1
-        end
-        val_end = pos
-        # Trim trailing whitespace from value
-        while val_end > val_start && whitespace?(@css.getbyte(val_end - 1))
-          val_end -= 1
-        end
-        value = byteslice_encoded(val_start, val_end - val_start)
-        pos += 1 if pos < end_pos && @css.getbyte(pos) == BYTE_SEMICOLON
-        # Create Declaration struct (at-rules don't use !important)
-        declarations << Declaration.new(property, value, false)
+        # Parse declaration using shared helper (at-rules don't use !important)
+        decl, pos = parse_single_declaration(pos, end_pos, false)
+        declarations << decl if decl
       end
       declarations