p_css 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/css/tokenizer.rb +42 -17
- data/lib/css/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: df1cd693075fe04da6a0c9ce4c65c9c4ef5f85c5b84bb82bd0136cf62fe52552
|
|
4
|
+
data.tar.gz: 9a9fc875c3872c49396c5b6c753a099ba3cc344efa7aa97745958d277630c7cb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 28c76784dac592aa39cfaa5100b69e4765861ec2ab1ed536fb95dfb963481de2bd64d6b6b954f71b14b17ea4a328cc65af6f7a63df662f2083c2ef5ec623e892
|
|
7
|
+
data.tar.gz: 291d12b999205c032e7cebaa5da4b69aff717cb9f7b9743c3c54ce99539b59ec49612b87e9bcf105d374cc79871d8955fc57f18200b8ba76c396894db14f349f
|
data/lib/css/tokenizer.rb
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
module CSS
|
|
2
2
|
# Tokenizer based on CSS Syntax Module Level 3/4 §4.
|
|
3
3
|
# https://www.w3.org/TR/css-syntax-3/#tokenization
|
|
4
|
+
#
|
|
5
|
+
# Not thread-safe: an instance carries mutable cursors (`@pos`,
|
|
6
|
+
# `@newline_cursor`) that advance over the input. Allocate one
|
|
7
|
+
# tokenizer per thread.
|
|
4
8
|
class Tokenizer
|
|
5
9
|
include CodePoints
|
|
6
10
|
|
|
@@ -21,9 +25,10 @@ module CSS
|
|
|
21
25
|
PREPROCESS_RE = /\r\n?|\f|\0/.freeze
|
|
22
26
|
|
|
23
27
|
def initialize(input, preserve_comments: false)
|
|
24
|
-
@
|
|
28
|
+
@chars = preprocess(input)
|
|
25
29
|
@pos = 0
|
|
26
|
-
@newlines = collect_newline_offsets(@
|
|
30
|
+
@newlines = collect_newline_offsets(@chars)
|
|
31
|
+
@newline_cursor = 0
|
|
27
32
|
@preserve_comments = preserve_comments
|
|
28
33
|
end
|
|
29
34
|
|
|
@@ -43,7 +48,7 @@ module CSS
|
|
|
43
48
|
def next_token
|
|
44
49
|
consume_comments unless @preserve_comments
|
|
45
50
|
|
|
46
|
-
return Token.new(:eof) if @pos >= @
|
|
51
|
+
return Token.new(:eof) if @pos >= @chars.length
|
|
47
52
|
|
|
48
53
|
start_offset = @pos
|
|
49
54
|
tok = consume_one_token
|
|
@@ -127,18 +132,25 @@ module CSS
|
|
|
127
132
|
end
|
|
128
133
|
end
|
|
129
134
|
|
|
135
|
+
# Random access on a non-ascii-only UTF-8 String is O(distance from
|
|
136
|
+
# the cached character index), and the peek-ahead pattern (`peek`,
|
|
137
|
+
# `peek(1)`, `peek(2)`) defeats the cache — empirically ~200× slower
|
|
138
|
+
# than indexing a flat Array. Splitting into `chars` once amortizes
|
|
139
|
+
# the UTF-8 walk and gives us O(1) random access for the rest of
|
|
140
|
+
# tokenization.
|
|
130
141
|
def preprocess(input)
|
|
131
|
-
input
|
|
132
|
-
|
|
133
|
-
|
|
142
|
+
input
|
|
143
|
+
.encode('UTF-8')
|
|
144
|
+
.gsub(PREPROCESS_RE) { $~[0] == "\0" ? CodePoints::REPLACEMENT : "\n" }
|
|
145
|
+
.chars
|
|
134
146
|
end
|
|
135
147
|
|
|
136
148
|
def peek(offset = 0)
|
|
137
|
-
@
|
|
149
|
+
@chars[@pos + offset]
|
|
138
150
|
end
|
|
139
151
|
|
|
140
152
|
def consume
|
|
141
|
-
c = @
|
|
153
|
+
c = @chars[@pos]
|
|
142
154
|
return nil if c.nil?
|
|
143
155
|
|
|
144
156
|
@pos += 1
|
|
@@ -149,21 +161,34 @@ module CSS
|
|
|
149
161
|
@pos -= 1
|
|
150
162
|
end
|
|
151
163
|
|
|
152
|
-
def collect_newline_offsets(
|
|
164
|
+
def collect_newline_offsets(chars)
|
|
153
165
|
offsets = []
|
|
154
|
-
i =
|
|
166
|
+
i = 0
|
|
167
|
+
n = chars.length
|
|
168
|
+
|
|
169
|
+
while i < n
|
|
170
|
+
offsets << i if chars[i] == "\n"
|
|
171
|
+
i += 1
|
|
172
|
+
end
|
|
155
173
|
|
|
156
|
-
offsets << i while (i = input.index("\n", i + 1))
|
|
157
174
|
offsets
|
|
158
175
|
end
|
|
159
176
|
|
|
160
|
-
# Newline characters themselves are reported as belonging to the line
|
|
161
|
-
# they terminate (col = offset + 1 on line 1, etc).
|
|
177
|
+
# Newline characters themselves are reported as belonging to the
|
|
178
|
+
# line they terminate (col = offset + 1 on line 1, etc).
|
|
179
|
+
#
|
|
180
|
+
# Tokens are emitted in order, so the offsets passed in are
|
|
181
|
+
# monotonically non-decreasing. We keep a running cursor into
|
|
182
|
+
# `@newlines` and advance linearly — amortized O(1) per call,
|
|
183
|
+
# vs. O(log n) per call with a fresh `bsearch`.
|
|
162
184
|
def line_column_at(offset)
|
|
163
|
-
|
|
164
|
-
|
|
185
|
+
while @newline_cursor < @newlines.size && @newlines[@newline_cursor] < offset
|
|
186
|
+
@newline_cursor += 1
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
prev_nl = @newline_cursor.zero? ? -1 : @newlines[@newline_cursor - 1]
|
|
165
190
|
|
|
166
|
-
[
|
|
191
|
+
[@newline_cursor + 1, offset - prev_nl]
|
|
167
192
|
end
|
|
168
193
|
|
|
169
194
|
def whitespace?(c)
|
|
@@ -242,7 +267,7 @@ module CSS
|
|
|
242
267
|
end
|
|
243
268
|
|
|
244
269
|
def eof?
|
|
245
|
-
@pos >= @
|
|
270
|
+
@pos >= @chars.length
|
|
246
271
|
end
|
|
247
272
|
|
|
248
273
|
def consume_whitespace
|
data/lib/css/version.rb
CHANGED