RubyGems - p_css - Versions diffs - 0.1.2 → 0.1.4 - Mend

p_css 0.1.2 → 0.1.4

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (7) [hide / show]

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 772060eec5d726253913cd8be2daa6180429680d014c1752c9b26b15618e4ba8
-  data.tar.gz: edd5e5afc5871362dc21cca89d8eb6a5b085350022f79f2b8c6f07a032f07aaa
+  metadata.gz: 20afa2206ed855fdd796f19179d06a8dc8b76231c223560d59e664cb43ddf897
+  data.tar.gz: e951f89d04ff6db6f68151ad05a414e4f0f7d05f2dfc48580b22ef31b6f949de
 SHA512:
-  metadata.gz: 7b7e331023830f09938bbd03a29f50c49b19b40169226a9c13866c1ce2436b0dcd71987fb183f403b7b98784c68c8e8426e6043dd3757ebe5343e37862e2c228
-  data.tar.gz: c06594e2e861c77e56cc32dcb1980b3fba8bfe0175fa1f108e4b0d02bef929efd59274e5c53f04ecea8245b47c583f41eb8a4396a5689052a569af54e8026f5a
+  metadata.gz: dc533dd2a146654d7a622b3206568168bea1e404b163ef6b24ae1c841ef4cd5ff3a4621bc6794f64178ffd5c87a0c4e6a46f70fd6c7999a52539091849ae2941
+  data.tar.gz: 67d08837559466bc5713aa6a40d8e67030ff9389228527b4ed8b4bb5e1121965fa90a31e6cf5aae40167151e535ae83b9f6073068e72d495a5098559b331698d

data/lib/css/code_points.rb CHANGED Viewed

@@ -1,36 +1,59 @@
 module CSS
   # Character class predicates from CSS Syntax §4.2 Definitions, plus the
   # U+FFFD replacement character used both during tokenization and
-  # serialization. Implemented with char comparisons rather than regex to
-  # avoid pattern-match overhead in the tokenizer's inner loop.
+  # serialization.
+  #
+  # ASCII bytes are looked up in a precomputed boolean table (one Array
+  # access + one branch); non-ASCII code points (>= 0x80) are always
+  # ident-cp / ident-start per spec, so the helpers fall back to a single
+  # `c.ord >= 0x80` check. Avoids the chain of `String#<=>` calls a
+  # range-style predicate would dispatch.
   module CodePoints
     REPLACEMENT = "�".freeze
+    def self.build_table(*ranges_or_ints)
+      Array.new(128, false).tap {|a|
+        ranges_or_ints.each {|r|
+          if r.is_a?(Range) then r.each { a[it] = true }
+          else                   a[r] = true
+          end
+        }
+      }.freeze
+    end
+    DIGIT_TABLE       = build_table(0x30..0x39)
+    HEX_DIGIT_TABLE   = build_table(0x30..0x39, 0x41..0x46, 0x61..0x66)
+    IDENT_START_TABLE = build_table(0x41..0x5A, 0x61..0x7A, 0x5F)
+    IDENT_CP_TABLE    = build_table(0x30..0x39, 0x41..0x5A, 0x61..0x7A, 0x5F, 0x2D)
     module_function
     def digit?(c)
-      !c.nil? && c >= '0' && c <= '9'
+      return false if c.nil?
+      o = c.ord
+      o < 128 && DIGIT_TABLE[o]
     end
     def hex_digit?(c)
       return false if c.nil?
-      (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')
+      o = c.ord
+      o < 128 && HEX_DIGIT_TABLE[o]
     end
     def ident_start_code_point?(c)
       return false if c.nil?
-      return true  if c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
-      c.ord >= 0x80
+      o = c.ord
+      o >= 128 || IDENT_START_TABLE[o]
     end
     def ident_code_point?(c)
       return false if c.nil?
-      return true  if c == '_' || c == '-' || (c >= '0' && c <= '9')
-      return true  if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
-      c.ord >= 0x80
+      o = c.ord
+      o >= 128 || IDENT_CP_TABLE[o]
     end
   end
 end

data/lib/css/token.rb CHANGED Viewed

@@ -17,7 +17,7 @@ module CSS
       eof
     ].freeze
-    attr_reader :type, :value, :flag, :unit, :position
+    attr_reader :type, :value, :flag, :unit
     def initialize(type, value = nil, flag: nil, unit: nil, position: nil)
       raise ArgumentError, "unknown token type: #{type.inspect}" unless TYPES.include?(type)
@@ -58,21 +58,50 @@ module CSS
       type == :whitespace || type == :comment
     end
-    # Mutating: assigns the token's source position and returns self. Used
-    # by the tokenizer so each token requires only a single allocation.
-    def assign_position!(pos)
-      @position = pos
+    # Most tokens never have their `position` read after parsing, so the
+    # tokenizer plants raw offsets + a shared `@newlines` reference here
+    # via this method, and `Token#position` materializes the `Position`
+    # Data on first read.
+    def assign_source!(start_offset, end_offset, newlines)
+      @start_offset = start_offset
+      @end_offset   = end_offset
+      @newlines     = newlines
       self
     end
+    # Returns nil for tokens built without source info (i.e. tokens
+    # constructed by hand or via `Token.new(:eof)`).
+    def position
+      return @position if @position
+      return nil unless instance_variable_defined?(:@start_offset)
+      @position = compute_position
+    end
+    # Reads `@position` directly so debug-style introspection doesn't
+    # materialize a `Position` as a side effect.
     def inspect
       parts = ["type=#{type.inspect}"]
       parts << "value=#{value.inspect}" unless value.nil?
       parts << "flag=#{flag.inspect}"   unless flag.nil?
       parts << "unit=#{unit.inspect}"   unless unit.nil?
-      parts << "@#{position}"           unless position.nil?
+      parts << "@#{@position}"          if @position
       "#<CSS::Token #{parts.join(' ')}>"
     end
+    private
+    def compute_position
+      idx     = @newlines.bsearch_index { it >= @start_offset } || @newlines.size
+      prev_nl = idx.zero? ? -1 : @newlines[idx - 1]
+      Position.new(
+        line:       idx + 1,
+        column:     @start_offset - prev_nl,
+        offset:     @start_offset,
+        end_offset: @end_offset
+      )
+    end
   end
 end

data/lib/css/tokenizer.rb CHANGED Viewed

@@ -1,6 +1,9 @@
 module CSS
   # Tokenizer based on CSS Syntax Module Level 3/4 §4.
   # https://www.w3.org/TR/css-syntax-3/#tokenization
+  #
+  # Not thread-safe: an instance carries a mutable cursor (`@pos`) that
+  # advances over the input. Allocate one tokenizer per thread.
   class Tokenizer
     include CodePoints
@@ -21,9 +24,10 @@ module CSS
     PREPROCESS_RE = /\r\n?|\f|\0/.freeze
     def initialize(input, preserve_comments: false)
-      @input             = preprocess(input)
+      @chars             = preprocess(input)
+      @length            = @chars.length
       @pos               = 0
-      @newlines          = collect_newline_offsets(@input)
+      @newlines          = collect_newline_offsets(@chars)
       @preserve_comments = preserve_comments
     end
@@ -43,13 +47,12 @@ module CSS
     def next_token
       consume_comments unless @preserve_comments
-      return Token.new(:eof) if @pos >= @input.length
+      return Token.new(:eof) if @pos >= @length
       start_offset = @pos
       tok          = consume_one_token
-      line, column = line_column_at(start_offset)
-      tok.assign_position!(Position.new(line:, column:, offset: start_offset, end_offset: @pos))
+      tok.assign_source!(start_offset, @pos, @newlines)
     end
     private
@@ -127,18 +130,25 @@ module CSS
       end
     end
+    # Random access on a non-ascii-only UTF-8 String is O(distance from
+    # the cached character index), and the peek-ahead pattern (`peek`,
+    # `peek(1)`, `peek(2)`) defeats the cache — empirically ~200× slower
+    # than indexing a flat Array. Splitting into `chars` once amortizes
+    # the UTF-8 walk and gives us O(1) random access for the rest of
+    # tokenization.
     def preprocess(input)
-      input.encode('UTF-8').gsub(PREPROCESS_RE) {
-        $~[0] == "\0" ? CodePoints::REPLACEMENT : "\n"
-      }
+      input
+        .encode('UTF-8')
+        .gsub(PREPROCESS_RE) { $~[0] == "\0" ? CodePoints::REPLACEMENT : "\n" }
+        .chars
     end
     def peek(offset = 0)
-      @input[@pos + offset]
+      @chars[@pos + offset]
     end
     def consume
-      c = @input[@pos]
+      c = @chars[@pos]
       return nil if c.nil?
       @pos += 1
@@ -149,21 +159,17 @@ module CSS
       @pos -= 1
     end
-    def collect_newline_offsets(input)
+    def collect_newline_offsets(chars)
       offsets = []
-      i       = -1
+      i       = 0
+      n       = chars.length
-      offsets << i while (i = input.index("\n", i + 1))
-      offsets
-    end
-    # Newline characters themselves are reported as belonging to the line
-    # they terminate (col = offset + 1 on line 1, etc).
-    def line_column_at(offset)
-      idx     = @newlines.bsearch_index { it >= offset } || @newlines.size
-      prev_nl = idx.zero? ? -1 : @newlines[idx - 1]
+      while i < n
+        offsets << i if chars[i] == "\n"
+        i += 1
+      end
-      [idx + 1, offset - prev_nl]
+      offsets
     end
     def whitespace?(c)
@@ -242,7 +248,7 @@ module CSS
     end
     def eof?
-      @pos >= @input.length
+      @pos >= @length
     end
     def consume_whitespace

data/lib/css/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CSS
-  VERSION = '0.1.2'
+  VERSION = '0.1.4'
 end

data/sig/css/token.rbs CHANGED Viewed

@@ -22,7 +22,7 @@ module CSS
     def comment?: () -> bool
     def trivia?: () -> bool
-    def assign_position!: (Position pos) -> self
+    def assign_source!: (Integer start_offset, Integer end_offset, Array[Integer] newlines) -> self
     def ==: (untyped other) -> bool
     def eql?: (untyped other) -> bool

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: p_css
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.4
 platform: ruby
 authors:
 - Keita Urashima