RubyGems - parselly - Versions diffs - 1.2.0 → 1.3.0 - Mend

parselly 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: e1e4e245059433130b385388d399fe29355a1c679a15327a02cb3b49e69ae23b
-  data.tar.gz: ceb1167e8a32c25543b96988f754f76eb01fe287fc279a6ceb8dd26ffc258ca9
+  metadata.gz: 5bc554d1e8b9c0bba096e513d21aa054f35589a7e52f1c0e2108c0639ec1027b
+  data.tar.gz: 55f3bec1107b38b70bdd4995375fe8cab816036fe86d6cf27e96566689cce03f
 SHA512:
-  metadata.gz: 33dab2f628019bbd51d482c53ae267f98914a5a561beb146ce57b3625f30535b661de168b2c6150faa79f16af8a49a8c15e7d6047a118577359fdd9334249a7f
-  data.tar.gz: 38803e8cc427a8eaa0b2a63f9446cf91334a368a7ea86908da80eb62cca3569a7b44f4c9536165a57f99048cf604dd948814b12dff25ac49c43ab637ec344324
+  metadata.gz: 5dc070854c73e51ac4e21dde9910cd687c788f2bdfe99c9c22870d2e73222dcfc9679cf26e54e9c81032284a201b840e0af395958347f3e0bc10967294f990d0
+  data.tar.gz: 37a0b28ade0b3d74a062fee148d18d7448f200ac04bbb107bdf036a92808b7dc5b803a179dbb4363100fb290351d14e504651a07ac74b9dd68829e5c36af3a28

data/README.md CHANGED Viewed

@@ -1,30 +1,77 @@
 # Parselly [![Gem Version](https://badge.fury.io/rb/parselly.svg)](https://badge.fury.io/rb/parselly) [![CI](https://github.com/ydah/parselly/actions/workflows/test.yml/badge.svg)](https://github.com/ydah/parselly/actions/workflows/test.yml)
-Parselly is a module providing a simple way to parse and extract data from a css selector.
+Pure Ruby CSS selector parser.
 ## Installation
-Add this line to your application's Gemfile:
 ```ruby
 gem 'parselly'
 ```
-And then execute:
 ```bash
 bundle install
 ```
-Or install it yourself as:
+Or install it directly:
 ```bash
 gem install parselly
 ```
-## Development
+Requires Ruby 2.7 or newer.
+## Usage
+```ruby
+require 'parselly'
+ast = Parselly.parse('article#main.content[data-state="open"] > a:hover')
+ast.ids
+#=> ["main"]
+ast.attributes
+#=> [{ name: "data-state", operator: "=", value: "open" }]
+ast.pseudo_class_names
+#=> ["hover"]
+ast.specificity
+#=> [1, 3, 2]
+```
+Strict parsing raises `Parselly::LexError` or `Parselly::SyntaxError` for invalid selectors:
+```ruby
+Parselly.parse('div >')
+```
+Use tolerant mode when you want a `Parselly::ParseResult` instead:
+```ruby
+result = Parselly.parse('div >', tolerant: true)
+result.success?
+#=> false
+result.errors.first[:message]
+#=> "Parse error: unexpected $end '' at 1:6"
+```
-After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+Use `Parselly.sanitize` to escape text for a CSS identifier:
-To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+```ruby
+Parselly.sanitize('1st item')
+#=> "\\31 st\\ item"
+```
+## Development
+```bash
+bin/setup
+bundle exec rake
+```
-## Contributing
+## License
-Bug reports and pull requests are welcome on GitHub at https://github.com/ydah/parselly.
+MIT

data/Rakefile CHANGED Viewed

@@ -7,6 +7,16 @@ namespace 'build' do
   task :parser do
     sh 'bundle exec racc parser.y --embedded --frozen -o lib/parselly/parser.rb -t --log-file=parser.output'
   end
+  desc 'verify generated parser files are in sync'
+  task check_parser: :parser do
+    sh 'git diff --exit-code lib/parselly/parser.rb parser.output'
+  end
+end
+desc 'run parser benchmarks'
+task :benchmark do
+  ruby 'benchmark/parser_benchmark.rb'
 end
 require 'rspec/core/rake_task'

data/lib/parselly/lexer.rb CHANGED Viewed

@@ -5,12 +5,76 @@ require 'strscan'
 module Parselly
   class Lexer
     Identifier = Struct.new(:value, :raw) do
+      attr_accessor :position
       def to_s
         value
       end
+      def ==(other)
+        other.respond_to?(:value) ? value == other.value : value == other
+      end
+    end
+    TokenValue = Struct.new(:value, :raw, :position, :quote, keyword_init: true) do
+      def to_s
+        value.to_s
+      end
+      def ==(other)
+        other.respond_to?(:value) ? value == other.value : value == other
+      end
+    end
+    Token = Struct.new(:type, :value, :position, keyword_init: true) do
+      def [](index)
+        to_ary[index]
+      end
+      def []=(index, new_value)
+        case index
+        when 0
+          self.type = new_value
+        when 1
+          self.value = new_value
+        when 2
+          self.position = new_value
+        else
+          raise IndexError, "index #{index} outside of token"
+        end
+      end
+      def first
+        type
+      end
+      def last
+        position
+      end
+      def to_ary
+        [type, value, position]
+      end
+      alias to_a to_ary
+      def ==(other)
+        return super unless other.respond_to?(:to_ary)
+        other_type, other_value, other_position = other.to_ary
+        return false unless type == other_type
+        return false unless value == other_value
+        return position == other_position unless position.is_a?(Hash) && other_position.is_a?(Hash)
+        other_position.all? { |key, expected| position[key] == expected }
+      end
     end
     TOKENS = {
+      # Namespace and column combinators
+      '|' => :PIPE,
+      '||' => :COLUMN,
       # Combinators
       '>' => :CHILD,
       '+' => :ADJACENT,
@@ -37,27 +101,42 @@ module Parselly
       '*=' => :SUBSTRINGMATCH
     }.freeze
-    # Pre-compiled regular expressions for better performance
-    MULTI_CHAR_OPERATORS = [
-      [/~=/, :INCLUDES],
-      [/\|=/, :DASHMATCH],
-      [/\^=/, :PREFIXMATCH],
-      [/\$=/, :SUFFIXMATCH],
-      [/\*=/, :SUBSTRINGMATCH]
-    ].freeze
-    SINGLE_CHAR_OPERATOR_REGEX = /[>+~\[\]():,.#*=-]/.freeze
-    WHITESPACE_REGEX = /[ \t\n\r]+/.freeze
-    STRING_DOUBLE_REGEX = /"([^"\\]|\\.)*"/.freeze
-    STRING_SINGLE_REGEX = /'([^'\\]|\\.)*'/.freeze
-    IDENTIFIER_REGEX = /(?:--|-?[a-zA-Z_])(?:[\w-]|\\[^\n\r\f])*/.freeze
+    MULTI_CHAR_TOKENS = {
+      '~=' => :INCLUDES,
+      '|=' => :DASHMATCH,
+      '^=' => :PREFIXMATCH,
+      '$=' => :SUFFIXMATCH,
+      '*=' => :SUBSTRINGMATCH,
+      '||' => :COLUMN
+    }.freeze
+    SINGLE_CHAR_OPERATOR_REGEX = /[|>+~\[\]():,.#*=-]/.freeze
+    WHITESPACE_REGEX = /[ \t\n\r\f]+/.freeze
+    COMMENT_REGEX = %r{/\*[^*]*\*+(?:[^/*][^*]*\*+)*/}.freeze
+    ESCAPE_SEQUENCE = /\\(?:[0-9a-fA-F]{1,6}[ \t\n\r\f]?|[^\n\r\f])/.freeze
+    IDENTIFIER_REGEX = /
+      (?:
+        --
+        |
+        -?(?:[a-zA-Z_]|[^\x00-\x7F]|#{ESCAPE_SEQUENCE})
+      )
+      (?:[a-zA-Z0-9_-]|[^\x00-\x7F]|#{ESCAPE_SEQUENCE})*
+    /x.freeze
     NUMBER_REGEX = /\d+(\.\d+)?/.freeze
-    ESCAPE_REGEX = /\\(.)/.freeze
+    HEX_ESCAPE_REGEX = /\\([0-9a-fA-F]{1,6})([ \t\n\r\f])?/.freeze
+    ESCAPED_NEWLINE_REGEX = /\\(?:\r\n|[\n\r\f])/.freeze
+    SIMPLE_ESCAPE_REGEX = /\\([^\n\r\f])/.freeze
+    REPLACEMENT_CHARACTER = "\uFFFD"
     attr_reader :line, :column
     def initialize(input)
-      @scanner = StringScanner.new(input)
+      unless input.valid_encoding?
+        raise_lexer_error('Invalid input encoding', { line: 1, column: 1, offset: 0 })
+      end
+      preprocessed_input, @offset_map = preprocess_input(input)
+      @scanner = StringScanner.new(preprocessed_input)
       @line = 1
       @column = 1
       @tokens = []
@@ -65,54 +144,99 @@ module Parselly
     def tokenize
       until @scanner.eos?
-        skip_whitespace
+        skip_ignored
         break if @scanner.eos?
-        pos = { line: @line, column: @column, offset: @scanner.pos }
+        start_position = current_position
-        if (token = scan_string)
-          @tokens << [:STRING, token, pos]
-        elsif (token = scan_number)
-          @tokens << [:NUMBER, token, pos]
-        elsif (token = scan_operator)
-          @tokens << [token, @scanner.matched, pos]
-        elsif (token = scan_identifier)
-          @tokens << [:IDENT, token, pos]
+        if (token = scan_string(start_position))
+          type, value = token
+          @tokens << build_token(type, value, start_position)
+        elsif (value = scan_number)
+          @tokens << build_token(:NUMBER, value, start_position)
+        elsif (type = scan_operator)
+          @tokens << build_token(type, @scanner.matched, start_position)
+        elsif (value = scan_identifier(start_position))
+          @tokens << build_token(:IDENT, value, start_position)
         else
           char = @scanner.getch
-          raise "Unexpected character: #{char} at #{pos[:line]}:#{pos[:column]} (offset #{pos[:offset]})"
+          raise_lexer_error("Unexpected character: #{char}", start_position)
         end
       end
-      @tokens << [false, nil, { line: @line, column: @column, offset: @scanner.pos }]
+      @tokens << Token.new(type: false, value: nil, position: eof_position)
       @tokens
     end
     private
-    def skip_whitespace
-      while @scanner.scan(WHITESPACE_REGEX)
-        matched = @scanner.matched
-        newline_count = matched.count("\n")
-        if newline_count > 0
-          @line += newline_count
-          @column = matched.size - matched.rindex("\n")
+    def preprocess_input(input)
+      output = +''
+      offset_map = { 0 => 0 }
+      chars = input.each_char.to_a
+      original_offset = 0
+      index = 0
+      while index < chars.length
+        char = chars[index]
+        original_start = original_offset
+        original_offset += char.bytesize
+        if char == "\r"
+          if chars[index + 1] == "\n"
+            index += 1
+            original_offset += chars[index].bytesize
+          end
+          append_preprocessed(output, offset_map, "\n", original_start, original_offset)
+        elsif char == "\f"
+          append_preprocessed(output, offset_map, "\n", original_start, original_offset)
+        elsif char == "\0" || surrogate_codepoint?(char)
+          append_preprocessed(output, offset_map, REPLACEMENT_CHARACTER, original_start, original_offset)
         else
-          @column += matched.size
+          append_preprocessed(output, offset_map, char, original_start, original_offset)
         end
+        index += 1
       end
+      offset_map[output.bytesize] = original_offset
+      [output, offset_map]
     end
-    def scan_operator
-      # Check multi-character operators first
-      MULTI_CHAR_OPERATORS.each do |regex, token|
-        if @scanner.scan(regex)
+    def append_preprocessed(output, offset_map, value, original_start, original_end)
+      offset_map[output.bytesize] = original_start
+      output << value
+      offset_map[output.bytesize] = original_end
+    end
+    def surrogate_codepoint?(char)
+      char.ord.between?(0xD800, 0xDFFF)
+    end
+    def skip_ignored
+      loop do
+        if @scanner.scan(WHITESPACE_REGEX)
+          update_position(@scanner.matched)
+        elsif @scanner.peek(2) == '/*'
+          pos = { line: @line, column: @column, offset: @scanner.pos }
+          unless @scanner.scan(COMMENT_REGEX)
+            raise_lexer_error('Unterminated comment', pos)
+          end
           update_position(@scanner.matched)
-          return token
+        else
+          break
         end
       end
+    end
+    def scan_operator
+      two_chars = @scanner.peek(2)
+      if (token = MULTI_CHAR_TOKENS[two_chars])
+        @scanner.pos += 2
+        update_position(two_chars)
+        return token
+      end
-      # Single character operators
       return unless @scanner.scan(SINGLE_CHAR_OPERATOR_REGEX)
       char = @scanner.matched
@@ -120,25 +244,26 @@ module Parselly
       TOKENS[char]
     end
-    # NOTE: Unlike identifiers (where backslash escapes are processed),
-    # escape sequences inside strings (e.g., \n, \", \', \\) are NOT processed.
-    # The raw string content is returned as-is after removing outer quotes.
-    # This is a known limitation for attribute values, as strings are treated
-    # as raw text for simplicity. Identifiers process escapes to support patterns
-    # like .hover\:bg-blue-500, but strings in attributes don't require this.
-    def scan_string
-      if @scanner.scan(STRING_DOUBLE_REGEX)
-        str = @scanner.matched
-        update_position(str)
-        str[1..-2] # Remove quotes
-      elsif @scanner.scan(STRING_SINGLE_REGEX)
-        str = @scanner.matched
-        update_position(str)
-        str[1..-2] # Remove quotes
+    def scan_string(position)
+      quote = @scanner.peek(1)
+      return unless quote == '"' || quote == "'"
+      @scanner.getch
+      update_position(quote)
+      raw = +''
+      until @scanner.eos?
+        char = @scanner.peek(1)
+        return build_string_token(:STRING, raw, position, quote) if char == quote && consume_string_char(raw)
+        return build_string_token(:BAD_STRING, raw, position, quote) if newline?(char)
+        consume_string_char(raw)
       end
+      build_string_token(:STRING, raw, position, quote)
     end
-    def scan_identifier
+    def scan_identifier(position)
       # Match identifiers with optional escape sequences
       # CSS allows \<any-char> as escape in identifiers (e.g., .hover\:bg-blue-500)
       #
@@ -150,9 +275,7 @@ module Parselly
       ident = @scanner.matched
       update_position(ident)
-      # Remove backslashes from escaped characters
-      normalized = ident.gsub(ESCAPE_REGEX, '\1')
-      Identifier.new(normalized, ident)
+      Identifier.new(unescape_css(ident), ident).tap { |identifier| identifier.position = position }
     end
     def scan_number
@@ -163,15 +286,102 @@ module Parselly
       num
     end
+    def consume_string_char(raw)
+      char = @scanner.getch
+      update_position(char)
+      return true if char == '"' || char == "'"
+      raw << char
+      return true unless char == '\\'
+      return true if @scanner.eos?
+      escaped = @scanner.getch
+      update_position(escaped)
+      raw << escaped
+      true
+    end
+    def build_string_token(type, raw, position, quote)
+      [type, TokenValue.new(value: unescape_css(raw), raw: raw, position: position, quote: quote)]
+    end
+    def newline?(char)
+      char == "\n" || char == "\r" || char == "\f"
+    end
     def update_position(text)
-      text.each_char do |char|
-        if char == "\n"
-          @line += 1
-          @column = 1
-        else
-          @column += 1
-        end
+      unless text.match?(/[\n\r\f]/)
+        @column += text.each_char.count
+        return
       end
+      lines = text.split(/\r\n|[\n\r\f]/, -1)
+      @line += lines.length - 1
+      @column = lines.last.each_char.count + 1
+    end
+    def current_position
+      { line: @line, column: @column, offset: original_offset(@scanner.pos) }
+    end
+    def original_offset(preprocessed_offset)
+      @offset_map.fetch(preprocessed_offset, preprocessed_offset)
+    end
+    def build_token(type, value, start_position)
+      position = start_position.merge(
+        start_line: start_position[:line],
+        start_column: start_position[:column],
+        start_offset: start_position[:offset],
+        end_line: @line,
+        end_column: @column,
+        end_offset: original_offset(@scanner.pos)
+      )
+      value.position = position if value.respond_to?(:position=)
+      Token.new(type: type, value: value, position: position)
+    end
+    def eof_position
+      current_position.merge(
+        start_line: @line,
+        start_column: @column,
+        start_offset: original_offset(@scanner.pos),
+        end_line: @line,
+        end_column: @column,
+        end_offset: original_offset(@scanner.pos)
+      )
+    end
+    def unescape_css(value)
+      value
+        .gsub(ESCAPED_NEWLINE_REGEX, '')
+        .gsub(HEX_ESCAPE_REGEX) { decode_hex_escape(Regexp.last_match(1)) }
+        .gsub(SIMPLE_ESCAPE_REGEX, '\1')
+    end
+    def decode_hex_escape(hex)
+      codepoint = hex.to_i(16)
+      return REPLACEMENT_CHARACTER if codepoint.zero? || codepoint > 0x10FFFF
+      codepoint.chr(Encoding::UTF_8)
+    rescue RangeError
+      REPLACEMENT_CHARACTER
+    end
+    def raise_lexer_error(message, position)
+      error = {
+        message: "#{message} at #{position[:line]}:#{position[:column]} (offset #{position[:offset]})",
+        line: position[:line],
+        column: position[:column],
+        offset: position[:offset]
+      }
+      if defined?(Parselly::LexError)
+        raise Parselly::LexError, error
+      end
+      raise error[:message]
     end
   end
 end