parselly 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c245172165bcac6e4b24a355b6e26a4039960fb8654374587523c3041015bb96
4
- data.tar.gz: 1897eee14cb66e216422815883375837168e8850d17e87a98ca7a05873d18d58
3
+ metadata.gz: 5bc554d1e8b9c0bba096e513d21aa054f35589a7e52f1c0e2108c0639ec1027b
4
+ data.tar.gz: 55f3bec1107b38b70bdd4995375fe8cab816036fe86d6cf27e96566689cce03f
5
5
  SHA512:
6
- metadata.gz: 011ea12078d3311c28d00864167fa5cd7a5a9b1afd24feacccb4df2631b00e095aedf231793bd93bf7717b2bf99b7bfd28a6918d7b5d0f3e2794ac3d5f0faa6b
7
- data.tar.gz: 261006f641a09ecea004423a68601b0a7c4941d35cf13ddd0acd74dfc530a1de19a2e4d5d370cef8a366271a2a713756ba6ce3f7fd1b1fe8fbe464e534a3549a
6
+ metadata.gz: 5dc070854c73e51ac4e21dde9910cd687c788f2bdfe99c9c22870d2e73222dcfc9679cf26e54e9c81032284a201b840e0af395958347f3e0bc10967294f990d0
7
+ data.tar.gz: 37a0b28ade0b3d74a062fee148d18d7448f200ac04bbb107bdf036a92808b7dc5b803a179dbb4363100fb290351d14e504651a07ac74b9dd68829e5c36af3a28
data/README.md CHANGED
@@ -1,30 +1,77 @@
1
1
  # Parselly [![Gem Version](https://badge.fury.io/rb/parselly.svg)](https://badge.fury.io/rb/parselly) [![CI](https://github.com/ydah/parselly/actions/workflows/test.yml/badge.svg)](https://github.com/ydah/parselly/actions/workflows/test.yml)
2
2
 
3
- Parselly is a module providing a simple way to parse and extract data from a css selector.
3
+ Pure Ruby CSS selector parser.
4
4
 
5
5
  ## Installation
6
6
 
7
- Add this line to your application's Gemfile:
8
7
  ```ruby
9
8
  gem 'parselly'
10
9
  ```
11
10
 
12
- And then execute:
13
11
  ```bash
14
12
  bundle install
15
13
  ```
16
14
 
17
- Or install it yourself as:
15
+ Or install it directly:
16
+
18
17
  ```bash
19
18
  gem install parselly
20
19
  ```
21
20
 
22
- ## Development
21
+ Requires Ruby 2.7 or newer.
22
+
23
+ ## Usage
24
+
25
+ ```ruby
26
+ require 'parselly'
27
+
28
+ ast = Parselly.parse('article#main.content[data-state="open"] > a:hover')
29
+
30
+ ast.ids
31
+ #=> ["main"]
32
+
33
+ ast.attributes
34
+ #=> [{ name: "data-state", operator: "=", value: "open" }]
35
+
36
+ ast.pseudo_class_names
37
+ #=> ["hover"]
38
+
39
+ ast.specificity
40
+ #=> [1, 3, 2]
41
+ ```
42
+
43
+ Strict parsing raises `Parselly::LexError` or `Parselly::SyntaxError` for invalid selectors:
44
+
45
+ ```ruby
46
+ Parselly.parse('div >')
47
+ ```
48
+
49
+ Use tolerant mode when you want a `Parselly::ParseResult` instead:
50
+
51
+ ```ruby
52
+ result = Parselly.parse('div >', tolerant: true)
53
+
54
+ result.success?
55
+ #=> false
56
+
57
+ result.errors.first[:message]
58
+ #=> "Parse error: unexpected $end '' at 1:6"
59
+ ```
23
60
 
24
- After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
61
+ Use `Parselly.sanitize` to escape text for a CSS identifier:
25
62
 
26
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
63
+ ```ruby
64
+ Parselly.sanitize('1st item')
65
+ #=> "\\31 st\\ item"
66
+ ```
67
+
68
+ ## Development
69
+
70
+ ```bash
71
+ bin/setup
72
+ bundle exec rake
73
+ ```
27
74
 
28
- ## Contributing
75
+ ## License
29
76
 
30
- Bug reports and pull requests are welcome on GitHub at https://github.com/ydah/parselly.
77
+ MIT
data/Rakefile CHANGED
@@ -7,6 +7,16 @@ namespace 'build' do
7
7
  task :parser do
8
8
  sh 'bundle exec racc parser.y --embedded --frozen -o lib/parselly/parser.rb -t --log-file=parser.output'
9
9
  end
10
+
11
+ desc 'verify generated parser files are in sync'
12
+ task check_parser: :parser do
13
+ sh 'git diff --exit-code lib/parselly/parser.rb parser.output'
14
+ end
15
+ end
16
+
17
+ desc 'run parser benchmarks'
18
+ task :benchmark do
19
+ ruby 'benchmark/parser_benchmark.rb'
10
20
  end
11
21
 
12
22
  require 'rspec/core/rake_task'
@@ -4,7 +4,77 @@ require 'strscan'
4
4
 
5
5
  module Parselly
6
6
  class Lexer
7
# Value object for a lexed identifier.
#
# `value` and `raw` are the Struct members; `position` is a plain accessor
# kept outside the member list, so it takes no part in Struct#to_a.
Identifier = Struct.new(:value, :raw) do
  attr_accessor :position

  # The string form of an identifier is its `value` member.
  alias to_s value

  # Compares by `value` only. Anything responding to `value` is compared
  # through it; any other object is compared against `value` directly.
  def ==(other)
    comparable = other.respond_to?(:value) ? other.value : other
    value == comparable
  end
end
18
+
19
# Keyword-initialised token payload with `value`, `raw`, `position` and
# `quote` members.
TokenValue = Struct.new(:value, :raw, :position, :quote, keyword_init: true) do
  # String form of the `value` member.
  def to_s
    value.to_s
  end

  # Compares by `value` only: objects exposing `value` are compared through
  # it, everything else is compared against `value` directly.
  def ==(other)
    return value == other.value if other.respond_to?(:value)

    value == other
  end
end
28
+
29
# A lexer token that behaves like the `[type, value, position]` triple:
# it supports positional indexing, destructuring through `to_ary`, and
# equality against arrays.
Token = Struct.new(:type, :value, :position, keyword_init: true) do
  # Reads a slot.
  #
  # Symbol and String arguments go to Struct#[] so member-name access
  # (`token[:type]`) keeps working. Everything else is delegated to the
  # `[type, value, position]` array, so negative indices work and an
  # out-of-range Integer returns nil.
  def [](index)
    return super if index.is_a?(Symbol) || index.is_a?(String)

    to_ary[index]
  end

  # Writes a slot by member name (Struct#[]=) or by array position.
  # Negative positions are accepted to mirror what `[]` can read.
  #
  # @raise [IndexError] when a positional index is outside the triple
  def []=(index, new_value)
    return super if index.is_a?(Symbol) || index.is_a?(String)

    case index
    when 0, -3
      self.type = new_value
    when 1, -2
      self.value = new_value
    when 2, -1
      self.position = new_value
    else
      raise IndexError, "index #{index} outside of token"
    end
  end

  # Array-style `first`: the token type.
  def first
    type
  end

  # Array-style `last`: the token position.
  def last
    position
  end

  # Implicit array conversion, enabling `type, value, position = token`.
  def to_ary
    [type, value, position]
  end

  alias to_a to_ary

  # Equality against anything array-like (including other tokens).
  #
  # Type and value must match. When both positions are hashes, only the keys
  # present in the OTHER position are compared, so `[:IDENT, 'a', { line: 1 }]`
  # equals a token whose position carries additional keys. Objects without
  # `to_ary` fall back to Struct#==.
  def ==(other)
    return super unless other.respond_to?(:to_ary)

    other_type, other_value, other_position = other.to_ary
    return false unless type == other_type
    return false unless value == other_value
    return position == other_position unless position.is_a?(Hash) && other_position.is_a?(Hash)

    other_position.all? { |key, expected| position[key] == expected }
  end
end
72
+
7
73
  TOKENS = {
74
+ # Namespace and column combinators
75
+ '|' => :PIPE,
76
+ '||' => :COLUMN,
77
+
8
78
  # Combinators
9
79
  '>' => :CHILD,
10
80
  '+' => :ADJACENT,
@@ -31,27 +101,42 @@ module Parselly
31
101
  '*=' => :SUBSTRINGMATCH
32
102
  }.freeze
33
103
 
34
- # Pre-compiled regular expressions for better performance
35
- MULTI_CHAR_OPERATORS = [
36
- [/~=/, :INCLUDES],
37
- [/\|=/, :DASHMATCH],
38
- [/\^=/, :PREFIXMATCH],
39
- [/\$=/, :SUFFIXMATCH],
40
- [/\*=/, :SUBSTRINGMATCH]
41
- ].freeze
42
-
43
- SINGLE_CHAR_OPERATOR_REGEX = /[>+~\[\]():,.#*=-]/.freeze
44
- WHITESPACE_REGEX = /[ \t\n\r]+/.freeze
45
- STRING_DOUBLE_REGEX = /"([^"\\]|\\.)*"/.freeze
46
- STRING_SINGLE_REGEX = /'([^'\\]|\\.)*'/.freeze
47
- IDENTIFIER_REGEX = /(?:--|-?[a-zA-Z_])(?:[\w-]|\\[^\n\r\f])*/.freeze
104
+ MULTI_CHAR_TOKENS = {
105
+ '~=' => :INCLUDES,
106
+ '|=' => :DASHMATCH,
107
+ '^=' => :PREFIXMATCH,
108
+ '$=' => :SUFFIXMATCH,
109
+ '*=' => :SUBSTRINGMATCH,
110
+ '||' => :COLUMN
111
+ }.freeze
112
+
113
+ SINGLE_CHAR_OPERATOR_REGEX = /[|>+~\[\]():,.#*=-]/.freeze
114
+ WHITESPACE_REGEX = /[ \t\n\r\f]+/.freeze
115
+ COMMENT_REGEX = %r{/\*[^*]*\*+(?:[^/*][^*]*\*+)*/}.freeze
116
+ ESCAPE_SEQUENCE = /\\(?:[0-9a-fA-F]{1,6}[ \t\n\r\f]?|[^\n\r\f])/.freeze
117
+ IDENTIFIER_REGEX = /
118
+ (?:
119
+ --
120
+ |
121
+ -?(?:[a-zA-Z_]|[^\x00-\x7F]|#{ESCAPE_SEQUENCE})
122
+ )
123
+ (?:[a-zA-Z0-9_-]|[^\x00-\x7F]|#{ESCAPE_SEQUENCE})*
124
+ /x.freeze
48
125
  NUMBER_REGEX = /\d+(\.\d+)?/.freeze
49
- ESCAPE_REGEX = /\\(.)/.freeze
126
+ HEX_ESCAPE_REGEX = /\\([0-9a-fA-F]{1,6})([ \t\n\r\f])?/.freeze
127
+ ESCAPED_NEWLINE_REGEX = /\\(?:\r\n|[\n\r\f])/.freeze
128
+ SIMPLE_ESCAPE_REGEX = /\\([^\n\r\f])/.freeze
129
+ REPLACEMENT_CHARACTER = "\uFFFD"
50
130
 
51
131
  attr_reader :line, :column
52
132
 
53
133
  def initialize(input)
54
- @scanner = StringScanner.new(input)
134
+ unless input.valid_encoding?
135
+ raise_lexer_error('Invalid input encoding', { line: 1, column: 1, offset: 0 })
136
+ end
137
+
138
+ preprocessed_input, @offset_map = preprocess_input(input)
139
+ @scanner = StringScanner.new(preprocessed_input)
55
140
  @line = 1
56
141
  @column = 1
57
142
  @tokens = []
@@ -59,54 +144,99 @@ module Parselly
59
144
 
60
145
  def tokenize
61
146
  until @scanner.eos?
62
- skip_whitespace
147
+ skip_ignored
63
148
  break if @scanner.eos?
64
149
 
65
- pos = { line: @line, column: @column }
150
+ start_position = current_position
66
151
 
67
- if (token = scan_string)
68
- @tokens << [:STRING, token, pos]
69
- elsif (token = scan_number)
70
- @tokens << [:NUMBER, token, pos]
71
- elsif (token = scan_operator)
72
- @tokens << [token, @scanner.matched, pos]
73
- elsif (token = scan_identifier)
74
- @tokens << [:IDENT, token, pos]
152
+ if (token = scan_string(start_position))
153
+ type, value = token
154
+ @tokens << build_token(type, value, start_position)
155
+ elsif (value = scan_number)
156
+ @tokens << build_token(:NUMBER, value, start_position)
157
+ elsif (type = scan_operator)
158
+ @tokens << build_token(type, @scanner.matched, start_position)
159
+ elsif (value = scan_identifier(start_position))
160
+ @tokens << build_token(:IDENT, value, start_position)
75
161
  else
76
162
  char = @scanner.getch
77
- raise "Unexpected character: #{char} at #{pos[:line]}:#{pos[:column]}"
163
+ raise_lexer_error("Unexpected character: #{char}", start_position)
78
164
  end
79
165
  end
80
166
 
81
- @tokens << [false, nil, { line: @line, column: @column }]
167
+ @tokens << Token.new(type: false, value: nil, position: eof_position)
82
168
  @tokens
83
169
  end
84
170
 
85
171
  private
86
172
 
87
- def skip_whitespace
88
- while @scanner.scan(WHITESPACE_REGEX)
89
- matched = @scanner.matched
90
- newline_count = matched.count("\n")
91
- if newline_count > 0
92
- @line += newline_count
93
- @column = matched.size - matched.rindex("\n")
173
# Normalises the raw input before scanning and records how byte offsets in
# the normalised text map back to byte offsets in the original.
#
# - "\r\n", a lone "\r" and "\f" each become a single "\n"
# - "\0" and surrogate code points become REPLACEMENT_CHARACTER
# - every other character is copied through unchanged
#
# Returns `[normalised_string, offset_map]`, where offset_map is keyed by
# normalised byte offset.
def preprocess_input(input)
  normalized = +''
  offsets = { 0 => 0 }
  characters = input.each_char.to_a
  consumed = 0
  skip_line_feed = false

  characters.each_with_index do |char, position|
    # The "\n" of a CRLF pair was already accounted for with its "\r".
    if skip_line_feed
      skip_line_feed = false
      next
    end

    started_at = consumed
    consumed += char.bytesize

    replacement =
      case char
      when "\r"
        follower = characters[position + 1]
        if follower == "\n"
          consumed += follower.bytesize
          skip_line_feed = true
        end
        "\n"
      when "\f"
        "\n"
      else
        char == "\0" || surrogate_codepoint?(char) ? REPLACEMENT_CHARACTER : char
      end

    append_preprocessed(normalized, offsets, replacement, started_at, consumed)
  end

  offsets[normalized.bytesize] = consumed
  [normalized, offsets]
end
99
205
 
100
- def scan_operator
101
- # Check multi-character operators first
102
- MULTI_CHAR_OPERATORS.each do |regex, token|
103
- if @scanner.scan(regex)
206
# Appends `value` to `output` and records, for the byte offsets just before
# and just after the appended text, the matching original offsets.
def append_preprocessed(output, offset_map, value, original_start, original_end)
  offset_map.store(output.bytesize, original_start)
  output.concat(value)
  offset_map.store(output.bytesize, original_end)
end
211
+
212
# True when the character's code point lies in the surrogate range
# U+D800..U+DFFF.
def surrogate_codepoint?(char)
  (0xD800..0xDFFF).cover?(char.ord)
end
215
+
216
# Skips any run of whitespace and `/* ... */` comments, keeping the
# line/column counters up to date.
#
# An opening `/*` with no matching terminator is reported through
# raise_lexer_error with the position where the comment starts.
def skip_ignored
  loop do
    if @scanner.scan(WHITESPACE_REGEX)
      update_position(@scanner.matched)
    elsif @scanner.peek(2) == '/*'
      # Use current_position so the reported offset is translated back to the
      # original input, like every other position the lexer emits.
      comment_start = current_position
      raise_lexer_error('Unterminated comment', comment_start) unless @scanner.scan(COMMENT_REGEX)

      update_position(@scanner.matched)
    else
      break
    end
  end
end
231
+
232
+ def scan_operator
233
+ two_chars = @scanner.peek(2)
234
+ if (token = MULTI_CHAR_TOKENS[two_chars])
235
+ @scanner.pos += 2
236
+ update_position(two_chars)
237
+ return token
238
+ end
108
239
 
109
- # Single character operators
110
240
  return unless @scanner.scan(SINGLE_CHAR_OPERATOR_REGEX)
111
241
 
112
242
  char = @scanner.matched
@@ -114,25 +244,26 @@ module Parselly
114
244
  TOKENS[char]
115
245
  end
116
246
 
117
- # NOTE: Unlike identifiers (where backslash escapes are processed),
118
- # escape sequences inside strings (e.g., \n, \", \', \\) are NOT processed.
119
- # The raw string content is returned as-is after removing outer quotes.
120
- # This is a known limitation for attribute values, as strings are treated
121
- # as raw text for simplicity. Identifiers process escapes to support patterns
122
- # like .hover\:bg-blue-500, but strings in attributes don't require this.
123
- def scan_string
124
- if @scanner.scan(STRING_DOUBLE_REGEX)
125
- str = @scanner.matched
126
- update_position(str)
127
- str[1..-2] # Remove quotes
128
- elsif @scanner.scan(STRING_SINGLE_REGEX)
129
- str = @scanner.matched
130
- update_position(str)
131
- str[1..-2] # Remove quotes
247
# Scans a quoted string starting at the current scanner position.
#
# Returns nil when the next character is not a quote. Otherwise returns the
# pair built by build_string_token:
# - :STRING when the closing quote is found, or when input ends first
# - :BAD_STRING when an unescaped newline is reached before the closing quote
#   (the newline itself is left unconsumed)
#
# `position` is the token start position and is passed through unchanged.
def scan_string(position)
  quote = @scanner.peek(1)
  return unless quote == '"' || quote == "'"

  @scanner.getch
  update_position(quote)
  raw = +''

  until @scanner.eos?
    char = @scanner.peek(1)

    if char == quote
      @scanner.getch
      update_position(char)
      return build_string_token(:STRING, raw, position, quote)
    end

    return build_string_token(:BAD_STRING, raw, position, quote) if newline?(char)

    if char == '"' || char == "'"
      # The other quote kind is ordinary content (e.g. the apostrophe in
      # "it's"). It is recorded here because consume_string_char swallows
      # quote characters without appending them.
      @scanner.getch
      update_position(char)
      raw << char
    else
      consume_string_char(raw)
    end
  end

  build_string_token(:STRING, raw, position, quote)
end
134
265
 
135
- def scan_identifier
266
+ def scan_identifier(position)
136
267
  # Match identifiers with optional escape sequences
137
268
  # CSS allows \<any-char> as escape in identifiers (e.g., .hover\:bg-blue-500)
138
269
  #
@@ -144,8 +275,7 @@ module Parselly
144
275
 
145
276
  ident = @scanner.matched
146
277
  update_position(ident)
147
- # Remove backslashes from escaped characters
148
- ident.gsub(ESCAPE_REGEX, '\1')
278
+ Identifier.new(unescape_css(ident), ident).tap { |identifier| identifier.position = position }
149
279
  end
150
280
 
151
281
  def scan_number
@@ -156,15 +286,102 @@ module Parselly
156
286
  num
157
287
  end
158
288
 
289
# Consumes one unit of string content and always returns true.
#
# - a quote character (either kind) is consumed but NOT appended to `raw`
# - any other character is appended to `raw`
# - a backslash additionally consumes and appends the character after it,
#   unless the input ends right after the backslash
def consume_string_char(raw)
  char = @scanner.getch
  update_position(char)

  unless char == '"' || char == "'"
    raw << char

    if char == '\\' && !@scanner.eos?
      escaped = @scanner.getch
      update_position(escaped)
      raw << escaped
    end
  end

  true
end
303
+
304
# Builds the `[type, TokenValue]` pair for a scanned string. The payload
# carries the unescaped text as `value` alongside the untouched `raw` text,
# the start `position` and the delimiting `quote`.
def build_string_token(type, raw, position, quote)
  payload = TokenValue.new(value: unescape_css(raw), raw: raw, position: position, quote: quote)
  [type, payload]
end
+ end
307
+
308
# True when `char` is a line feed, carriage return or form feed.
def newline?(char)
  ["\n", "\r", "\f"].include?(char)
end
+ end
311
+
159
312
  def update_position(text)
160
- text.each_char do |char|
161
- if char == "\n"
162
- @line += 1
163
- @column = 1
164
- else
165
- @column += 1
166
- end
313
+ unless text.match?(/[\n\r\f]/)
314
+ @column += text.each_char.count
315
+ return
316
+ end
317
+
318
+ lines = text.split(/\r\n|[\n\r\f]/, -1)
319
+ @line += lines.length - 1
320
+ @column = lines.last.each_char.count + 1
321
+ end
322
+
323
# Snapshot of the lexer position: current line and column plus the scanner
# offset translated through original_offset.
def current_position
  offset = original_offset(@scanner.pos)
  { line: @line, column: @column, offset: offset }
end
326
+
327
# Looks up `preprocessed_offset` in @offset_map; offsets with no entry are
# returned unchanged.
def original_offset(preprocessed_offset)
  @offset_map.fetch(preprocessed_offset) { preprocessed_offset }
end
330
+
331
# Builds a Token whose position spans from `start_position` to the lexer's
# current location.
#
# The position hash keeps the keys of `start_position` and adds start_* and
# end_* line/column/offset entries. When `value` can carry a position
# (responds to `position=`), it receives the same hash.
def build_token(type, value, start_position)
  span = {
    **start_position,
    start_line: start_position[:line],
    start_column: start_position[:column],
    start_offset: start_position[:offset],
    end_line: @line,
    end_column: @column,
    end_offset: original_offset(@scanner.pos)
  }

  value.position = span if value.respond_to?(:position=)
  Token.new(type: type, value: value, position: span)
end
344
+
345
# Position hash for the end-of-input token: the current position with the
# start_* and end_* entries all pointing at that same location.
def eof_position
  here = current_position
  offset = original_offset(@scanner.pos)

  here.merge(
    start_line: @line,
    start_column: @column,
    start_offset: offset,
    end_line: @line,
    end_column: @column,
    end_offset: offset
  )
end
355
+
356
# Decodes CSS escapes in `value` and returns the resulting string.
#
# - backslash + newline is removed
# - backslash + 1-6 hex digits (plus one optional trailing whitespace
#   character) becomes the result of decode_hex_escape
# - backslash + any other non-newline character becomes that character
# - a lone trailing backslash is left untouched
#
# All escapes are decoded in ONE left-to-right pass. Chaining separate
# substitutions would re-read the output of an earlier one, so `\\41` (an
# escaped backslash followed by "41") would collapse to "A".
def unescape_css(value)
  value.gsub(/\\(?:(?<newline>\r\n|[\n\r\f])|(?<hex>[0-9a-fA-F]{1,6})[ \t\n\r\f]?|(?<literal>[^\n\r\f]))/) do
    escape = Regexp.last_match
    if escape[:hex]
      decode_hex_escape(escape[:hex])
    else
      # An escaped newline has no :literal capture and contributes nothing.
      escape[:literal].to_s
    end
  end
end
362
+
363
# Converts the hex digits of a CSS escape into a UTF-8 character.
#
# Zero, values above U+10FFFF and surrogate code points (U+D800..U+DFFF)
# yield REPLACEMENT_CHARACTER. The range is checked explicitly instead of
# rescuing the RangeError that Integer#chr raises for such values.
def decode_hex_escape(hex)
  codepoint = hex.to_i(16)
  scalar_value = codepoint.between?(0x1, 0x10FFFF) && !codepoint.between?(0xD800, 0xDFFF)

  scalar_value ? codepoint.chr(Encoding::UTF_8) : REPLACEMENT_CHARACTER
end
371
+
372
# Raises a lexing failure located at `position` (a hash with :line, :column
# and :offset).
#
# When Parselly::LexError is defined it is raised with a details hash
# (:message, :line, :column, :offset); otherwise the formatted message is
# raised as a plain RuntimeError.
def raise_lexer_error(message, position)
  line, column, offset = position.values_at(:line, :column, :offset)
  details = {
    message: "#{message} at #{line}:#{column} (offset #{offset})",
    line: line,
    column: column,
    offset: offset
  }

  raise Parselly::LexError, details if defined?(Parselly::LexError)

  raise details[:message]
end
169
386
  end
170
387
  end