parselly 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1e4e245059433130b385388d399fe29355a1c679a15327a02cb3b49e69ae23b
4
- data.tar.gz: ceb1167e8a32c25543b96988f754f76eb01fe287fc279a6ceb8dd26ffc258ca9
3
+ metadata.gz: 5bc554d1e8b9c0bba096e513d21aa054f35589a7e52f1c0e2108c0639ec1027b
4
+ data.tar.gz: 55f3bec1107b38b70bdd4995375fe8cab816036fe86d6cf27e96566689cce03f
5
5
  SHA512:
6
- metadata.gz: 33dab2f628019bbd51d482c53ae267f98914a5a561beb146ce57b3625f30535b661de168b2c6150faa79f16af8a49a8c15e7d6047a118577359fdd9334249a7f
7
- data.tar.gz: 38803e8cc427a8eaa0b2a63f9446cf91334a368a7ea86908da80eb62cca3569a7b44f4c9536165a57f99048cf604dd948814b12dff25ac49c43ab637ec344324
6
+ metadata.gz: 5dc070854c73e51ac4e21dde9910cd687c788f2bdfe99c9c22870d2e73222dcfc9679cf26e54e9c81032284a201b840e0af395958347f3e0bc10967294f990d0
7
+ data.tar.gz: 37a0b28ade0b3d74a062fee148d18d7448f200ac04bbb107bdf036a92808b7dc5b803a179dbb4363100fb290351d14e504651a07ac74b9dd68829e5c36af3a28
data/README.md CHANGED
@@ -1,30 +1,77 @@
1
1
  # Parselly [![Gem Version](https://badge.fury.io/rb/parselly.svg)](https://badge.fury.io/rb/parselly) [![CI](https://github.com/ydah/parselly/actions/workflows/test.yml/badge.svg)](https://github.com/ydah/parselly/actions/workflows/test.yml)
2
2
 
3
- Parselly is a module providing a simple way to parse and extract data from a css selector.
3
+ Pure Ruby CSS selector parser.
4
4
 
5
5
  ## Installation
6
6
 
7
- Add this line to your application's Gemfile:
8
7
  ```ruby
9
8
  gem 'parselly'
10
9
  ```
11
10
 
12
- And then execute:
13
11
  ```bash
14
12
  bundle install
15
13
  ```
16
14
 
17
- Or install it yourself as:
15
+ Or install it directly:
16
+
18
17
  ```bash
19
18
  gem install parselly
20
19
  ```
21
20
 
22
- ## Development
21
+ Requires Ruby 2.7 or newer.
22
+
23
+ ## Usage
24
+
25
+ ```ruby
26
+ require 'parselly'
27
+
28
+ ast = Parselly.parse('article#main.content[data-state="open"] > a:hover')
29
+
30
+ ast.ids
31
+ #=> ["main"]
32
+
33
+ ast.attributes
34
+ #=> [{ name: "data-state", operator: "=", value: "open" }]
35
+
36
+ ast.pseudo_class_names
37
+ #=> ["hover"]
38
+
39
+ ast.specificity
40
+ #=> [1, 3, 2]
41
+ ```
42
+
43
+ Strict parsing raises `Parselly::LexError` or `Parselly::SyntaxError` for invalid selectors:
44
+
45
+ ```ruby
46
+ Parselly.parse('div >')
47
+ ```
48
+
49
+ Use tolerant mode when you want a `Parselly::ParseResult` instead:
50
+
51
+ ```ruby
52
+ result = Parselly.parse('div >', tolerant: true)
53
+
54
+ result.success?
55
+ #=> false
56
+
57
+ result.errors.first[:message]
58
+ #=> "Parse error: unexpected $end '' at 1:6"
59
+ ```
23
60
 
24
- After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
61
+ Use `Parselly.sanitize` to escape text for a CSS identifier:
25
62
 
26
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
63
+ ```ruby
64
+ Parselly.sanitize('1st item')
65
+ #=> "\\31 st\\ item"
66
+ ```
67
+
68
+ ## Development
69
+
70
+ ```bash
71
+ bin/setup
72
+ bundle exec rake
73
+ ```
27
74
 
28
- ## Contributing
75
+ ## License
29
76
 
30
- Bug reports and pull requests are welcome on GitHub at https://github.com/ydah/parselly.
77
+ MIT
data/Rakefile CHANGED
@@ -7,6 +7,16 @@ namespace 'build' do
7
7
  task :parser do
8
8
  sh 'bundle exec racc parser.y --embedded --frozen -o lib/parselly/parser.rb -t --log-file=parser.output'
9
9
  end
10
+
11
# Rebuilds the parser through the :parser prerequisite, then lets
# `git diff --exit-code` fail the task when it reports differences for the
# generated files.
desc 'verify generated parser files are in sync'
task check_parser: %i[parser] do
  sh 'git diff --exit-code lib/parselly/parser.rb parser.output'
end
15
+ end
16
+
17
# Executes the benchmark script with Rake's `ruby` helper.
desc 'run parser benchmarks'
task(:benchmark) { ruby 'benchmark/parser_benchmark.rb' }
11
21
 
12
22
  require 'rspec/core/rake_task'
@@ -5,12 +5,76 @@ require 'strscan'
5
5
  module Parselly
6
6
  class Lexer
7
7
  Identifier = Struct.new(:value, :raw) do
8
+ attr_accessor :position
9
+
8
10
  def to_s
9
11
  value
10
12
  end
13
+
14
+ def ==(other)
15
+ other.respond_to?(:value) ? value == other.value : value == other
16
+ end
17
+ end
18
+
19
# Token payload that keeps the unescaped +value+ together with its +raw+ text,
# source +position+ and (for strings) the delimiting +quote+.
TokenValue = Struct.new(:value, :raw, :position, :quote, keyword_init: true) do
  # @return [String] the value rendered through +to_s+
  def to_s
    value.to_s
  end

  # Compares by +value+ only. +other+ may be anything exposing +value+, or a
  # bare object (such as a String) that is compared with +value+ directly.
  def ==(other)
    expected = other.respond_to?(:value) ? other.value : other
    value == expected
  end
end
28
+
29
# One lexer token. It is a keyword-initialised Struct that also answers the
# Array-style protocol (+[]+, +[]=+, +first+, +last+, +to_ary+), so it can be
# used wherever a +[type, value, position]+ triple is expected.
Token = Struct.new(:type, :value, :position, keyword_init: true) do
  # Reads a field by Array-style index (anything Array#[] accepts) or by
  # Struct member name (Symbol/String), which is delegated to Struct#[].
  def [](index)
    return super if index.is_a?(Symbol) || index.is_a?(String)

    to_ary[index]
  end

  # Writes a field by integer index (0..2 or the matching negative index) or
  # by Struct member name.
  #
  # @raise [IndexError] when an integer index is outside the three fields
  def []=(index, new_value)
    case index
    when Symbol, String
      super
    when 0, -3
      self.type = new_value
    when 1, -2
      self.value = new_value
    when 2, -1
      self.position = new_value
    else
      raise IndexError, "index #{index} outside of token"
    end
  end

  # @return the token type (first element of the triple)
  def first
    type
  end

  # @return the token position (last element of the triple)
  def last
    position
  end

  # @return [Array] +[type, value, position]+; also enables destructuring
  def to_ary
    [type, value, position]
  end

  alias to_a to_ary

  # Equality against another token or any object responding to +to_ary+.
  # When both positions are Hashes only the keys present in the other
  # position are compared, so a position carrying extra keys still matches.
  def ==(other)
    return super unless other.respond_to?(:to_ary)

    other_type, other_value, other_position = other.to_ary
    return false unless type == other_type
    return false unless value == other_value
    return position == other_position unless position.is_a?(Hash) && other_position.is_a?(Hash)

    other_position.all? { |key, expected| position[key] == expected }
  end
end
12
72
 
13
73
  TOKENS = {
74
+ # Namespace and column combinators
75
+ '|' => :PIPE,
76
+ '||' => :COLUMN,
77
+
14
78
  # Combinators
15
79
  '>' => :CHILD,
16
80
  '+' => :ADJACENT,
@@ -37,27 +101,42 @@ module Parselly
37
101
  '*=' => :SUBSTRINGMATCH
38
102
  }.freeze
39
103
 
40
- # Pre-compiled regular expressions for better performance
41
- MULTI_CHAR_OPERATORS = [
42
- [/~=/, :INCLUDES],
43
- [/\|=/, :DASHMATCH],
44
- [/\^=/, :PREFIXMATCH],
45
- [/\$=/, :SUFFIXMATCH],
46
- [/\*=/, :SUBSTRINGMATCH]
47
- ].freeze
48
-
49
- SINGLE_CHAR_OPERATOR_REGEX = /[>+~\[\]():,.#*=-]/.freeze
50
- WHITESPACE_REGEX = /[ \t\n\r]+/.freeze
51
- STRING_DOUBLE_REGEX = /"([^"\\]|\\.)*"/.freeze
52
- STRING_SINGLE_REGEX = /'([^'\\]|\\.)*'/.freeze
53
- IDENTIFIER_REGEX = /(?:--|-?[a-zA-Z_])(?:[\w-]|\\[^\n\r\f])*/.freeze
104
  # Two-character operators. scan_operator looks these up from a two-character
  # peek before it tries the single-character operator pattern.
+ MULTI_CHAR_TOKENS = {
105
+ '~=' => :INCLUDES,
106
+ '|=' => :DASHMATCH,
107
+ '^=' => :PREFIXMATCH,
108
+ '$=' => :SUFFIXMATCH,
109
+ '*=' => :SUBSTRINGMATCH,
110
+ '||' => :COLUMN
111
+ }.freeze
112
+
113
  # Single characters that scan_operator maps through TOKENS.
+ SINGLE_CHAR_OPERATOR_REGEX = /[|>+~\[\]():,.#*=-]/.freeze
114
  # Run of whitespace (space, tab, LF, CR, FF) skipped between tokens.
+ WHITESPACE_REGEX = /[ \t\n\r\f]+/.freeze
115
  # A complete /* ... */ comment. skip_ignored raises "Unterminated comment"
  # when "/*" is present but this pattern does not match.
+ COMMENT_REGEX = %r{/\*[^*]*\*+(?:[^/*][^*]*\*+)*/}.freeze
116
  # A backslash followed by 1-6 hex digits and at most one whitespace
  # character, or a backslash followed by any character except LF, CR or FF.
+ ESCAPE_SEQUENCE = /\\(?:[0-9a-fA-F]{1,6}[ \t\n\r\f]?|[^\n\r\f])/.freeze
117
  # Identifier: starts with "--", or an optional "-" followed by a letter,
  # underscore, non-ASCII character or escape; continues with letters, digits,
  # "_", "-", non-ASCII characters or escapes.
+ IDENTIFIER_REGEX = /
118
+ (?:
119
+ --
120
+ |
121
+ -?(?:[a-zA-Z_]|[^\x00-\x7F]|#{ESCAPE_SEQUENCE})
122
+ )
123
+ (?:[a-zA-Z0-9_-]|[^\x00-\x7F]|#{ESCAPE_SEQUENCE})*
124
+ /x.freeze
54
125
  # Unsigned integer with an optional fractional part.
  NUMBER_REGEX = /\d+(\.\d+)?/.freeze
55
- ESCAPE_REGEX = /\\(.)/.freeze
126
  # The three patterns below are applied by unescape_css. HEX_ESCAPE_REGEX
  # captures the hex digits in group 1 and an optional single trailing
  # whitespace character in group 2.
+ HEX_ESCAPE_REGEX = /\\([0-9a-fA-F]{1,6})([ \t\n\r\f])?/.freeze
127
+ ESCAPED_NEWLINE_REGEX = /\\(?:\r\n|[\n\r\f])/.freeze
128
+ SIMPLE_ESCAPE_REGEX = /\\([^\n\r\f])/.freeze
129
  # U+FFFD. preprocess_input substitutes it for NUL, and decode_hex_escape
  # returns it for zero or out-of-range code points.
+ REPLACEMENT_CHARACTER = "\uFFFD"
56
130
 
57
131
  attr_reader :line, :column
58
132
 
59
133
  def initialize(input)
60
- @scanner = StringScanner.new(input)
134
+ unless input.valid_encoding?
135
+ raise_lexer_error('Invalid input encoding', { line: 1, column: 1, offset: 0 })
136
+ end
137
+
138
+ preprocessed_input, @offset_map = preprocess_input(input)
139
+ @scanner = StringScanner.new(preprocessed_input)
61
140
  @line = 1
62
141
  @column = 1
63
142
  @tokens = []
@@ -65,54 +144,99 @@ module Parselly
65
144
 
66
145
  def tokenize
67
146
  until @scanner.eos?
68
- skip_whitespace
147
+ skip_ignored
69
148
  break if @scanner.eos?
70
149
 
71
- pos = { line: @line, column: @column, offset: @scanner.pos }
150
+ start_position = current_position
72
151
 
73
- if (token = scan_string)
74
- @tokens << [:STRING, token, pos]
75
- elsif (token = scan_number)
76
- @tokens << [:NUMBER, token, pos]
77
- elsif (token = scan_operator)
78
- @tokens << [token, @scanner.matched, pos]
79
- elsif (token = scan_identifier)
80
- @tokens << [:IDENT, token, pos]
152
+ if (token = scan_string(start_position))
153
+ type, value = token
154
+ @tokens << build_token(type, value, start_position)
155
+ elsif (value = scan_number)
156
+ @tokens << build_token(:NUMBER, value, start_position)
157
+ elsif (type = scan_operator)
158
+ @tokens << build_token(type, @scanner.matched, start_position)
159
+ elsif (value = scan_identifier(start_position))
160
+ @tokens << build_token(:IDENT, value, start_position)
81
161
  else
82
162
  char = @scanner.getch
83
- raise "Unexpected character: #{char} at #{pos[:line]}:#{pos[:column]} (offset #{pos[:offset]})"
163
+ raise_lexer_error("Unexpected character: #{char}", start_position)
84
164
  end
85
165
  end
86
166
 
87
- @tokens << [false, nil, { line: @line, column: @column, offset: @scanner.pos }]
167
+ @tokens << Token.new(type: false, value: nil, position: eof_position)
88
168
  @tokens
89
169
  end
90
170
 
91
171
  private
92
172
 
93
- def skip_whitespace
94
- while @scanner.scan(WHITESPACE_REGEX)
95
- matched = @scanner.matched
96
- newline_count = matched.count("\n")
97
- if newline_count > 0
98
- @line += newline_count
99
- @column = matched.size - matched.rindex("\n")
173
# Normalises the raw input before scanning and records how byte offsets in
# the normalised text map back to byte offsets in the original.
#
# CR, CRLF and FF each become a single LF. NUL (and any character that
# surrogate_codepoint? flags) becomes REPLACEMENT_CHARACTER. Every other
# character is copied unchanged.
#
# @param input [String] the original selector text
# @return [Array(String, Hash)] the normalised text and the offset map
def preprocess_input(input)
  output = +''
  offset_map = { 0 => 0 }
  chars = input.each_char.to_a
  consumed = 0
  skip_line_feed = false

  chars.each_with_index do |char, position|
    # The LF of a CRLF pair was already accounted for with its CR.
    if skip_line_feed
      skip_line_feed = false
      next
    end

    span_start = consumed
    consumed += char.bytesize

    emitted =
      if char == "\r"
        following = chars[position + 1]
        if following == "\n"
          consumed += following.bytesize
          skip_line_feed = true
        end
        "\n"
      elsif char == "\f"
        "\n"
      elsif char == "\0" || surrogate_codepoint?(char)
        REPLACEMENT_CHARACTER
      else
        char
      end

    append_preprocessed(output, offset_map, emitted, span_start, consumed)
  end

  offset_map[output.bytesize] = consumed
  [output, offset_map]
end
105
205
 
106
- def scan_operator
107
- # Check multi-character operators first
108
- MULTI_CHAR_OPERATORS.each do |regex, token|
109
- if @scanner.scan(regex)
206
# Appends +value+ to +output+ and records, for the byte offsets just before
# and just after it, the corresponding original offsets.
def append_preprocessed(output, offset_map, value, original_start, original_end)
  offset_map.store(output.bytesize, original_start)
  output.concat(value)
  offset_map.store(output.bytesize, original_end)
end
211
+
212
# True when the character's code point lies in 0xD800..0xDFFF.
def surrogate_codepoint?(char)
  (0xD800..0xDFFF).cover?(char.ord)
end
215
+
216
# Skips any run of whitespace and /* ... */ comments at the scanner position,
# keeping line/column tracking up to date.
#
# A "/*" that COMMENT_REGEX cannot match is reported through
# raise_lexer_error as "Unterminated comment". The reported position comes
# from current_position, so its offset is expressed in the same
# original-input coordinates as every token position.
def skip_ignored
  loop do
    if @scanner.scan(WHITESPACE_REGEX)
      update_position(@scanner.matched)
    elsif @scanner.peek(2) == '/*'
      comment_start = current_position
      raise_lexer_error('Unterminated comment', comment_start) unless @scanner.scan(COMMENT_REGEX)

      update_position(@scanner.matched)
    else
      break
    end
  end
end
231
+
232
+ def scan_operator
233
+ two_chars = @scanner.peek(2)
234
+ if (token = MULTI_CHAR_TOKENS[two_chars])
235
+ @scanner.pos += 2
236
+ update_position(two_chars)
237
+ return token
238
+ end
114
239
 
115
- # Single character operators
116
240
  return unless @scanner.scan(SINGLE_CHAR_OPERATOR_REGEX)
117
241
 
118
242
  char = @scanner.matched
@@ -120,25 +244,26 @@ module Parselly
120
244
  TOKENS[char]
121
245
  end
122
246
 
123
- # NOTE: Unlike identifiers (where backslash escapes are processed),
124
- # escape sequences inside strings (e.g., \n, \", \', \\) are NOT processed.
125
- # The raw string content is returned as-is after removing outer quotes.
126
- # This is a known limitation for attribute values, as strings are treated
127
- # as raw text for simplicity. Identifiers process escapes to support patterns
128
- # like .hover\:bg-blue-500, but strings in attributes don't require this.
129
- def scan_string
130
- if @scanner.scan(STRING_DOUBLE_REGEX)
131
- str = @scanner.matched
132
- update_position(str)
133
- str[1..-2] # Remove quotes
134
- elsif @scanner.scan(STRING_SINGLE_REGEX)
135
- str = @scanner.matched
136
- update_position(str)
137
- str[1..-2] # Remove quotes
247
# Scans a quoted string starting at the scanner position.
#
# Returns nil when the next character is not a quote. Otherwise returns the
# pair produced by build_string_token:
# * :STRING when the matching closing quote is found, or when the input ends
#   first;
# * :BAD_STRING when an unescaped newline is reached (the newline itself is
#   left unconsumed).
#
# Only the quote character that opened the string terminates it. The other
# quote character is ordinary content and is kept in the raw text, so
# "it's" yields it's.
#
# @param position [Hash] start position recorded on the token value
def scan_string(position)
  quote = @scanner.peek(1)
  return unless quote == '"' || quote == "'"

  @scanner.getch
  update_position(quote)
  raw = +''

  until @scanner.eos?
    char = @scanner.peek(1)

    if char == quote
      # Closing delimiter: consume it without adding it to the content.
      @scanner.getch
      update_position(char)
      return build_string_token(:STRING, raw, position, quote)
    end
    return build_string_token(:BAD_STRING, raw, position, quote) if newline?(char)

    if char == '"' || char == "'"
      # consume_string_char discards every quote character, so the
      # non-delimiting quote is appended here instead.
      @scanner.getch
      update_position(char)
      raw << char
    else
      consume_string_char(raw)
    end
  end

  build_string_token(:STRING, raw, position, quote)
end
140
265
 
141
- def scan_identifier
266
+ def scan_identifier(position)
142
267
  # Match identifiers with optional escape sequences
143
268
  # CSS allows \<any-char> as escape in identifiers (e.g., .hover\:bg-blue-500)
144
269
  #
@@ -150,9 +275,7 @@ module Parselly
150
275
 
151
276
  ident = @scanner.matched
152
277
  update_position(ident)
153
- # Remove backslashes from escaped characters
154
- normalized = ident.gsub(ESCAPE_REGEX, '\1')
155
- Identifier.new(normalized, ident)
278
+ Identifier.new(unescape_css(ident), ident).tap { |identifier| identifier.position = position }
156
279
  end
157
280
 
158
281
  def scan_number
@@ -163,15 +286,102 @@ module Parselly
163
286
  num
164
287
  end
165
288
 
289
# Consumes one character of string content and appends it to +raw+.
#
# A quote character (either kind) is consumed but not appended. A backslash
# is appended together with the character that follows it, when there is one.
#
# @return [true] always
def consume_string_char(raw)
  current = @scanner.getch
  update_position(current)

  unless current == '"' || current == "'"
    raw << current

    if current == '\\' && !@scanner.eos?
      following = @scanner.getch
      update_position(following)
      raw << following
    end
  end

  true
end
303
+
304
# Builds the +[type, TokenValue]+ pair for a scanned string. The value is the
# unescaped form of +raw+.
def build_string_token(type, raw, position, quote)
  payload = TokenValue.new(
    value: unescape_css(raw),
    raw: raw,
    position: position,
    quote: quote
  )
  [type, payload]
end
307
+
308
# True when +char+ is LF, CR or FF.
def newline?(char)
  case char
  when "\n", "\r", "\f" then true
  else false
  end
end
311
+
166
312
  def update_position(text)
167
- text.each_char do |char|
168
- if char == "\n"
169
- @line += 1
170
- @column = 1
171
- else
172
- @column += 1
173
- end
313
+ unless text.match?(/[\n\r\f]/)
314
+ @column += text.each_char.count
315
+ return
174
316
  end
317
+
318
+ lines = text.split(/\r\n|[\n\r\f]/, -1)
319
+ @line += lines.length - 1
320
+ @column = lines.last.each_char.count + 1
321
+ end
322
+
323
# Current line/column together with the scanner offset mapped through
# original_offset.
def current_position
  offset = original_offset(@scanner.pos)
  { line: @line, column: @column, offset: offset }
end
326
+
327
# Looks up +preprocessed_offset+ in @offset_map, falling back to the offset
# itself when the map has no entry for it.
def original_offset(preprocessed_offset)
  @offset_map.fetch(preprocessed_offset) { preprocessed_offset }
end
330
+
331
# Wraps +value+ in a Token whose position is +start_position+ extended with
# explicit start_* keys and end_* keys taken from the current scanner state.
# When +value+ has a +position=+ writer it receives the same position hash.
def build_token(type, value, start_position)
  span = {
    start_line: start_position[:line],
    start_column: start_position[:column],
    start_offset: start_position[:offset],
    end_line: @line,
    end_column: @column,
    end_offset: original_offset(@scanner.pos)
  }
  position = start_position.merge(span)

  value.position = position if value.respond_to?(:position=)
  Token.new(type: type, value: value, position: position)
end
344
+
345
# Position for the end-of-input token: the current position, with start_* and
# end_* keys that all describe that same point.
def eof_position
  here = current_position
  offset = original_offset(@scanner.pos)

  here.merge(
    start_line: @line,
    start_column: @column,
    start_offset: offset,
    end_line: @line,
    end_column: @column,
    end_offset: offset
  )
end
355
+
356
# Resolves CSS escape sequences in +value+ in a single left-to-right pass.
#
# * backslash + newline (CRLF, LF, CR or FF) is removed;
# * backslash + 1-6 hex digits, plus at most one trailing whitespace
#   character, becomes the character decode_hex_escape returns;
# * backslash + any other non-newline character becomes that character.
#
# One pass matters: each escape is decoded exactly once, so the output of one
# escape (an escaped backslash, or a hex escape that decodes to a backslash)
# is never re-read as the start of another escape.
#
# @param value [String] raw identifier or string content
# @return [String] the unescaped text
def unescape_css(value)
  value.gsub(/\\(?:(\r\n|[\n\r\f])|([0-9a-fA-F]{1,6})[ \t\n\r\f]?|([^\n\r\f]))/) do
    escape = Regexp.last_match

    if escape[1]
      ''
    elsif escape[2]
      decode_hex_escape(escape[2])
    else
      escape[3]
    end
  end
end
362
+
363
# Turns the hex digits of an escape into a UTF-8 character.
#
# Zero, values above 0x10FFFF, and anything Integer#chr rejects with a
# RangeError yield REPLACEMENT_CHARACTER.
def decode_hex_escape(hex)
  codepoint = hex.to_i(16)

  if codepoint.zero? || codepoint > 0x10FFFF
    REPLACEMENT_CHARACTER
  else
    begin
      codepoint.chr(Encoding::UTF_8)
    rescue RangeError
      REPLACEMENT_CHARACTER
    end
  end
end
371
+
372
# Raises a lexer error for +message+ at +position+.
#
# The error details (formatted message, line, column, offset) are handed to
# Parselly::LexError when that constant is defined. Otherwise a plain
# RuntimeError carrying the formatted message is raised.
def raise_lexer_error(message, position)
  line, column, offset = position.values_at(:line, :column, :offset)
  details = {
    message: "#{message} at #{line}:#{column} (offset #{offset})",
    line: line,
    column: column,
    offset: offset
  }

  raise Parselly::LexError, details if defined?(Parselly::LexError)

  raise details[:message]
end
176
386
  end
177
387
  end