p_css 0.1.2 → 0.1.4

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 772060eec5d726253913cd8be2daa6180429680d014c1752c9b26b15618e4ba8
4
- data.tar.gz: edd5e5afc5871362dc21cca89d8eb6a5b085350022f79f2b8c6f07a032f07aaa
3
+ metadata.gz: 20afa2206ed855fdd796f19179d06a8dc8b76231c223560d59e664cb43ddf897
4
+ data.tar.gz: e951f89d04ff6db6f68151ad05a414e4f0f7d05f2dfc48580b22ef31b6f949de
5
5
  SHA512:
6
- metadata.gz: 7b7e331023830f09938bbd03a29f50c49b19b40169226a9c13866c1ce2436b0dcd71987fb183f403b7b98784c68c8e8426e6043dd3757ebe5343e37862e2c228
7
- data.tar.gz: c06594e2e861c77e56cc32dcb1980b3fba8bfe0175fa1f108e4b0d02bef929efd59274e5c53f04ecea8245b47c583f41eb8a4396a5689052a569af54e8026f5a
6
+ metadata.gz: dc533dd2a146654d7a622b3206568168bea1e404b163ef6b24ae1c841ef4cd5ff3a4621bc6794f64178ffd5c87a0c4e6a46f70fd6c7999a52539091849ae2941
7
+ data.tar.gz: 67d08837559466bc5713aa6a40d8e67030ff9389228527b4ed8b4bb5e1121965fa90a31e6cf5aae40167151e535ae83b9f6073068e72d495a5098559b331698d
@@ -1,36 +1,59 @@
1
1
  module CSS
2
2
  # Character class predicates from CSS Syntax §4.2 Definitions, plus the
3
3
  # U+FFFD replacement character used both during tokenization and
4
- # serialization. Implemented with char comparisons rather than regex to
5
- # avoid pattern-match overhead in the tokenizer's inner loop.
4
+ # serialization.
5
+ #
6
+ # ASCII bytes are looked up in a precomputed boolean table (one Array
7
+ # access + one branch); non-ASCII code points (>= 0x80) are always
8
+ # ident-cp / ident-start per spec, so the helpers fall back to a single
9
+ # `c.ord >= 0x80` check. Avoids the chain of `String#<=>` calls a
10
+ # range-style predicate would dispatch.
6
11
  module CodePoints
7
12
  REPLACEMENT = "�".freeze
8
13
 
14
+ def self.build_table(*ranges_or_ints)
15
+ Array.new(128, false).tap {|a|
16
+ ranges_or_ints.each {|r|
17
+ if r.is_a?(Range) then r.each { a[it] = true }
18
+ else a[r] = true
19
+ end
20
+ }
21
+ }.freeze
22
+ end
23
+
24
+ DIGIT_TABLE = build_table(0x30..0x39)
25
+ HEX_DIGIT_TABLE = build_table(0x30..0x39, 0x41..0x46, 0x61..0x66)
26
+ IDENT_START_TABLE = build_table(0x41..0x5A, 0x61..0x7A, 0x5F)
27
+ IDENT_CP_TABLE = build_table(0x30..0x39, 0x41..0x5A, 0x61..0x7A, 0x5F, 0x2D)
28
+
9
29
  module_function
10
30
 
11
31
  def digit?(c)
12
- !c.nil? && c >= '0' && c <= '9'
32
+ return false if c.nil?
33
+
34
+ o = c.ord
35
+ o < 128 && DIGIT_TABLE[o]
13
36
  end
14
37
 
15
38
  def hex_digit?(c)
16
39
  return false if c.nil?
17
40
 
18
- (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')
41
+ o = c.ord
42
+ o < 128 && HEX_DIGIT_TABLE[o]
19
43
  end
20
44
 
21
45
  def ident_start_code_point?(c)
22
46
  return false if c.nil?
23
- return true if c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
24
47
 
25
- c.ord >= 0x80
48
+ o = c.ord
49
+ o >= 128 || IDENT_START_TABLE[o]
26
50
  end
27
51
 
28
52
  def ident_code_point?(c)
29
53
  return false if c.nil?
30
- return true if c == '_' || c == '-' || (c >= '0' && c <= '9')
31
- return true if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
32
54
 
33
- c.ord >= 0x80
55
+ o = c.ord
56
+ o >= 128 || IDENT_CP_TABLE[o]
34
57
  end
35
58
  end
36
59
  end
data/lib/css/token.rb CHANGED
@@ -17,7 +17,7 @@ module CSS
17
17
  eof
18
18
  ].freeze
19
19
 
20
- attr_reader :type, :value, :flag, :unit, :position
20
+ attr_reader :type, :value, :flag, :unit
21
21
 
22
22
  def initialize(type, value = nil, flag: nil, unit: nil, position: nil)
23
23
  raise ArgumentError, "unknown token type: #{type.inspect}" unless TYPES.include?(type)
@@ -58,21 +58,50 @@ module CSS
58
58
  type == :whitespace || type == :comment
59
59
  end
60
60
 
61
- # Mutating: assigns the token's source position and returns self. Used
62
- # by the tokenizer so each token requires only a single allocation.
63
- def assign_position!(pos)
64
- @position = pos
61
+ # Most tokens never have their `position` read after parsing, so the
62
+ # tokenizer plants raw offsets + a shared `@newlines` reference here
63
+ # via this method, and `Token#position` materializes the `Position`
64
+ # Data on first read.
65
+ def assign_source!(start_offset, end_offset, newlines)
66
+ @start_offset = start_offset
67
+ @end_offset = end_offset
68
+ @newlines = newlines
65
69
  self
66
70
  end
67
71
 
72
+ # Returns nil for tokens built without source info (i.e. tokens
73
+ # constructed by hand or via `Token.new(:eof)`).
74
+ def position
75
+ return @position if @position
76
+ return nil unless instance_variable_defined?(:@start_offset)
77
+
78
+ @position = compute_position
79
+ end
80
+
81
+ # Reads `@position` directly so debug-style introspection doesn't
82
+ # materialize a `Position` as a side effect.
68
83
  def inspect
69
84
  parts = ["type=#{type.inspect}"]
70
85
  parts << "value=#{value.inspect}" unless value.nil?
71
86
  parts << "flag=#{flag.inspect}" unless flag.nil?
72
87
  parts << "unit=#{unit.inspect}" unless unit.nil?
73
- parts << "@#{position}" unless position.nil?
88
+ parts << "@#{@position}" if @position
74
89
 
75
90
  "#<CSS::Token #{parts.join(' ')}>"
76
91
  end
92
+
93
+ private
94
+
95
+ def compute_position
96
+ idx = @newlines.bsearch_index { it >= @start_offset } || @newlines.size
97
+ prev_nl = idx.zero? ? -1 : @newlines[idx - 1]
98
+
99
+ Position.new(
100
+ line: idx + 1,
101
+ column: @start_offset - prev_nl,
102
+ offset: @start_offset,
103
+ end_offset: @end_offset
104
+ )
105
+ end
77
106
  end
78
107
  end
data/lib/css/tokenizer.rb CHANGED
@@ -1,6 +1,9 @@
1
1
  module CSS
2
2
  # Tokenizer based on CSS Syntax Module Level 3/4 §4.
3
3
  # https://www.w3.org/TR/css-syntax-3/#tokenization
4
+ #
5
+ # Not thread-safe: an instance carries a mutable cursor (`@pos`) that
6
+ # advances over the input. Allocate one tokenizer per thread.
4
7
  class Tokenizer
5
8
  include CodePoints
6
9
 
@@ -21,9 +24,10 @@ module CSS
21
24
  PREPROCESS_RE = /\r\n?|\f|\0/.freeze
22
25
 
23
26
  def initialize(input, preserve_comments: false)
24
- @input = preprocess(input)
27
+ @chars = preprocess(input)
28
+ @length = @chars.length
25
29
  @pos = 0
26
- @newlines = collect_newline_offsets(@input)
30
+ @newlines = collect_newline_offsets(@chars)
27
31
  @preserve_comments = preserve_comments
28
32
  end
29
33
 
@@ -43,13 +47,12 @@ module CSS
43
47
  def next_token
44
48
  consume_comments unless @preserve_comments
45
49
 
46
- return Token.new(:eof) if @pos >= @input.length
50
+ return Token.new(:eof) if @pos >= @length
47
51
 
48
52
  start_offset = @pos
49
53
  tok = consume_one_token
50
- line, column = line_column_at(start_offset)
51
54
 
52
- tok.assign_position!(Position.new(line:, column:, offset: start_offset, end_offset: @pos))
55
+ tok.assign_source!(start_offset, @pos, @newlines)
53
56
  end
54
57
 
55
58
  private
@@ -127,18 +130,25 @@ module CSS
127
130
  end
128
131
  end
129
132
 
133
+ # Random access on a non-ascii-only UTF-8 String is O(distance from
134
+ # the cached character index), and the peek-ahead pattern (`peek`,
135
+ # `peek(1)`, `peek(2)`) defeats the cache — empirically ~200× slower
136
+ # than indexing a flat Array. Splitting into `chars` once amortizes
137
+ # the UTF-8 walk and gives us O(1) random access for the rest of
138
+ # tokenization.
130
139
  def preprocess(input)
131
- input.encode('UTF-8').gsub(PREPROCESS_RE) {
132
- $~[0] == "\0" ? CodePoints::REPLACEMENT : "\n"
133
- }
140
+ input
141
+ .encode('UTF-8')
142
+ .gsub(PREPROCESS_RE) { $~[0] == "\0" ? CodePoints::REPLACEMENT : "\n" }
143
+ .chars
134
144
  end
135
145
 
136
146
  def peek(offset = 0)
137
- @input[@pos + offset]
147
+ @chars[@pos + offset]
138
148
  end
139
149
 
140
150
  def consume
141
- c = @input[@pos]
151
+ c = @chars[@pos]
142
152
  return nil if c.nil?
143
153
 
144
154
  @pos += 1
@@ -149,21 +159,17 @@ module CSS
149
159
  @pos -= 1
150
160
  end
151
161
 
152
- def collect_newline_offsets(input)
162
+ def collect_newline_offsets(chars)
153
163
  offsets = []
154
- i = -1
164
+ i = 0
165
+ n = chars.length
155
166
 
156
- offsets << i while (i = input.index("\n", i + 1))
157
- offsets
158
- end
159
-
160
- # Newline characters themselves are reported as belonging to the line
161
- # they terminate (col = offset + 1 on line 1, etc).
162
- def line_column_at(offset)
163
- idx = @newlines.bsearch_index { it >= offset } || @newlines.size
164
- prev_nl = idx.zero? ? -1 : @newlines[idx - 1]
167
+ while i < n
168
+ offsets << i if chars[i] == "\n"
169
+ i += 1
170
+ end
165
171
 
166
- [idx + 1, offset - prev_nl]
172
+ offsets
167
173
  end
168
174
 
169
175
  def whitespace?(c)
@@ -242,7 +248,7 @@ module CSS
242
248
  end
243
249
 
244
250
  def eof?
245
- @pos >= @input.length
251
+ @pos >= @length
246
252
  end
247
253
 
248
254
  def consume_whitespace
data/lib/css/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module CSS
2
- VERSION = '0.1.2'
2
+ VERSION = '0.1.4'
3
3
  end
data/sig/css/token.rbs CHANGED
@@ -22,7 +22,7 @@ module CSS
22
22
  def comment?: () -> bool
23
23
  def trivia?: () -> bool
24
24
 
25
- def assign_position!: (Position pos) -> self
25
+ def assign_source!: (Integer start_offset, Integer end_offset, Array[Integer] newlines) -> self
26
26
 
27
27
  def ==: (untyped other) -> bool
28
28
  def eql?: (untyped other) -> bool
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: p_css
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Keita Urashima