p_css 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/css/code_points.rb +32 -9
- data/lib/css/token.rb +35 -6
- data/lib/css/tokenizer.rb +29 -23
- data/lib/css/version.rb +1 -1
- data/sig/css/token.rbs +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 20afa2206ed855fdd796f19179d06a8dc8b76231c223560d59e664cb43ddf897
|
|
4
|
+
data.tar.gz: e951f89d04ff6db6f68151ad05a414e4f0f7d05f2dfc48580b22ef31b6f949de
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: dc533dd2a146654d7a622b3206568168bea1e404b163ef6b24ae1c841ef4cd5ff3a4621bc6794f64178ffd5c87a0c4e6a46f70fd6c7999a52539091849ae2941
|
|
7
|
+
data.tar.gz: 67d08837559466bc5713aa6a40d8e67030ff9389228527b4ed8b4bb5e1121965fa90a31e6cf5aae40167151e535ae83b9f6073068e72d495a5098559b331698d
|
data/lib/css/code_points.rb
CHANGED
|
@@ -1,36 +1,59 @@
|
|
|
1
1
|
module CSS
|
|
2
2
|
# Character class predicates from CSS Syntax §4.2 Definitions, plus the
|
|
3
3
|
# U+FFFD replacement character used both during tokenization and
|
|
4
|
-
# serialization.
|
|
5
|
-
#
|
|
4
|
+
# serialization.
|
|
5
|
+
#
|
|
6
|
+
# ASCII bytes are looked up in a precomputed boolean table (one Array
|
|
7
|
+
# access + one branch); non-ASCII code points (>= 0x80) are always
|
|
8
|
+
# ident-cp / ident-start per spec, so the helpers fall back to a single
|
|
9
|
+
# `c.ord >= 0x80` check. Avoids the chain of `String#<=>` calls a
|
|
10
|
+
# range-style predicate would dispatch.
|
|
6
11
|
module CodePoints
|
|
7
12
|
REPLACEMENT = "�".freeze
|
|
8
13
|
|
|
14
|
+
def self.build_table(*ranges_or_ints)
|
|
15
|
+
Array.new(128, false).tap {|a|
|
|
16
|
+
ranges_or_ints.each {|r|
|
|
17
|
+
if r.is_a?(Range) then r.each { a[it] = true }
|
|
18
|
+
else a[r] = true
|
|
19
|
+
end
|
|
20
|
+
}
|
|
21
|
+
}.freeze
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
DIGIT_TABLE = build_table(0x30..0x39)
|
|
25
|
+
HEX_DIGIT_TABLE = build_table(0x30..0x39, 0x41..0x46, 0x61..0x66)
|
|
26
|
+
IDENT_START_TABLE = build_table(0x41..0x5A, 0x61..0x7A, 0x5F)
|
|
27
|
+
IDENT_CP_TABLE = build_table(0x30..0x39, 0x41..0x5A, 0x61..0x7A, 0x5F, 0x2D)
|
|
28
|
+
|
|
9
29
|
module_function
|
|
10
30
|
|
|
11
31
|
def digit?(c)
|
|
12
|
-
|
|
32
|
+
return false if c.nil?
|
|
33
|
+
|
|
34
|
+
o = c.ord
|
|
35
|
+
o < 128 && DIGIT_TABLE[o]
|
|
13
36
|
end
|
|
14
37
|
|
|
15
38
|
def hex_digit?(c)
|
|
16
39
|
return false if c.nil?
|
|
17
40
|
|
|
18
|
-
|
|
41
|
+
o = c.ord
|
|
42
|
+
o < 128 && HEX_DIGIT_TABLE[o]
|
|
19
43
|
end
|
|
20
44
|
|
|
21
45
|
def ident_start_code_point?(c)
|
|
22
46
|
return false if c.nil?
|
|
23
|
-
return true if c == '_' || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
|
|
24
47
|
|
|
25
|
-
c.ord
|
|
48
|
+
o = c.ord
|
|
49
|
+
o >= 128 || IDENT_START_TABLE[o]
|
|
26
50
|
end
|
|
27
51
|
|
|
28
52
|
def ident_code_point?(c)
|
|
29
53
|
return false if c.nil?
|
|
30
|
-
return true if c == '_' || c == '-' || (c >= '0' && c <= '9')
|
|
31
|
-
return true if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
|
|
32
54
|
|
|
33
|
-
c.ord
|
|
55
|
+
o = c.ord
|
|
56
|
+
o >= 128 || IDENT_CP_TABLE[o]
|
|
34
57
|
end
|
|
35
58
|
end
|
|
36
59
|
end
|
data/lib/css/token.rb
CHANGED
|
@@ -17,7 +17,7 @@ module CSS
|
|
|
17
17
|
eof
|
|
18
18
|
].freeze
|
|
19
19
|
|
|
20
|
-
attr_reader :type, :value, :flag, :unit
|
|
20
|
+
attr_reader :type, :value, :flag, :unit
|
|
21
21
|
|
|
22
22
|
def initialize(type, value = nil, flag: nil, unit: nil, position: nil)
|
|
23
23
|
raise ArgumentError, "unknown token type: #{type.inspect}" unless TYPES.include?(type)
|
|
@@ -58,21 +58,50 @@ module CSS
|
|
|
58
58
|
type == :whitespace || type == :comment
|
|
59
59
|
end
|
|
60
60
|
|
|
61
|
-
#
|
|
62
|
-
#
|
|
63
|
-
|
|
64
|
-
|
|
61
|
+
# Most tokens never have their `position` read after parsing, so the
|
|
62
|
+
# tokenizer plants raw offsets + a shared `@newlines` reference here
|
|
63
|
+
# via this method, and `Token#position` materializes the `Position`
|
|
64
|
+
# Data on first read.
|
|
65
|
+
def assign_source!(start_offset, end_offset, newlines)
|
|
66
|
+
@start_offset = start_offset
|
|
67
|
+
@end_offset = end_offset
|
|
68
|
+
@newlines = newlines
|
|
65
69
|
self
|
|
66
70
|
end
|
|
67
71
|
|
|
72
|
+
# Returns nil for tokens built without source info (i.e. tokens
|
|
73
|
+
# constructed by hand or via `Token.new(:eof)`).
|
|
74
|
+
def position
|
|
75
|
+
return @position if @position
|
|
76
|
+
return nil unless instance_variable_defined?(:@start_offset)
|
|
77
|
+
|
|
78
|
+
@position = compute_position
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
# Reads `@position` directly so debug-style introspection doesn't
|
|
82
|
+
# materialize a `Position` as a side effect.
|
|
68
83
|
def inspect
|
|
69
84
|
parts = ["type=#{type.inspect}"]
|
|
70
85
|
parts << "value=#{value.inspect}" unless value.nil?
|
|
71
86
|
parts << "flag=#{flag.inspect}" unless flag.nil?
|
|
72
87
|
parts << "unit=#{unit.inspect}" unless unit.nil?
|
|
73
|
-
parts << "@#{position}"
|
|
88
|
+
parts << "@#{@position}" if @position
|
|
74
89
|
|
|
75
90
|
"#<CSS::Token #{parts.join(' ')}>"
|
|
76
91
|
end
|
|
92
|
+
|
|
93
|
+
private
|
|
94
|
+
|
|
95
|
+
def compute_position
|
|
96
|
+
idx = @newlines.bsearch_index { it >= @start_offset } || @newlines.size
|
|
97
|
+
prev_nl = idx.zero? ? -1 : @newlines[idx - 1]
|
|
98
|
+
|
|
99
|
+
Position.new(
|
|
100
|
+
line: idx + 1,
|
|
101
|
+
column: @start_offset - prev_nl,
|
|
102
|
+
offset: @start_offset,
|
|
103
|
+
end_offset: @end_offset
|
|
104
|
+
)
|
|
105
|
+
end
|
|
77
106
|
end
|
|
78
107
|
end
|
data/lib/css/tokenizer.rb
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
module CSS
|
|
2
2
|
# Tokenizer based on CSS Syntax Module Level 3/4 §4.
|
|
3
3
|
# https://www.w3.org/TR/css-syntax-3/#tokenization
|
|
4
|
+
#
|
|
5
|
+
# Not thread-safe: an instance carries a mutable cursor (`@pos`) that
|
|
6
|
+
# advances over the input. Allocate one tokenizer per thread.
|
|
4
7
|
class Tokenizer
|
|
5
8
|
include CodePoints
|
|
6
9
|
|
|
@@ -21,9 +24,10 @@ module CSS
|
|
|
21
24
|
PREPROCESS_RE = /\r\n?|\f|\0/.freeze
|
|
22
25
|
|
|
23
26
|
def initialize(input, preserve_comments: false)
|
|
24
|
-
@
|
|
27
|
+
@chars = preprocess(input)
|
|
28
|
+
@length = @chars.length
|
|
25
29
|
@pos = 0
|
|
26
|
-
@newlines = collect_newline_offsets(@
|
|
30
|
+
@newlines = collect_newline_offsets(@chars)
|
|
27
31
|
@preserve_comments = preserve_comments
|
|
28
32
|
end
|
|
29
33
|
|
|
@@ -43,13 +47,12 @@ module CSS
|
|
|
43
47
|
def next_token
|
|
44
48
|
consume_comments unless @preserve_comments
|
|
45
49
|
|
|
46
|
-
return Token.new(:eof) if @pos >= @
|
|
50
|
+
return Token.new(:eof) if @pos >= @length
|
|
47
51
|
|
|
48
52
|
start_offset = @pos
|
|
49
53
|
tok = consume_one_token
|
|
50
|
-
line, column = line_column_at(start_offset)
|
|
51
54
|
|
|
52
|
-
tok.
|
|
55
|
+
tok.assign_source!(start_offset, @pos, @newlines)
|
|
53
56
|
end
|
|
54
57
|
|
|
55
58
|
private
|
|
@@ -127,18 +130,25 @@ module CSS
|
|
|
127
130
|
end
|
|
128
131
|
end
|
|
129
132
|
|
|
133
|
+
# Random access on a non-ascii-only UTF-8 String is O(distance from
|
|
134
|
+
# the cached character index), and the peek-ahead pattern (`peek`,
|
|
135
|
+
# `peek(1)`, `peek(2)`) defeats the cache — empirically ~200× slower
|
|
136
|
+
# than indexing a flat Array. Splitting into `chars` once amortizes
|
|
137
|
+
# the UTF-8 walk and gives us O(1) random access for the rest of
|
|
138
|
+
# tokenization.
|
|
130
139
|
def preprocess(input)
|
|
131
|
-
input
|
|
132
|
-
|
|
133
|
-
|
|
140
|
+
input
|
|
141
|
+
.encode('UTF-8')
|
|
142
|
+
.gsub(PREPROCESS_RE) { $~[0] == "\0" ? CodePoints::REPLACEMENT : "\n" }
|
|
143
|
+
.chars
|
|
134
144
|
end
|
|
135
145
|
|
|
136
146
|
def peek(offset = 0)
|
|
137
|
-
@
|
|
147
|
+
@chars[@pos + offset]
|
|
138
148
|
end
|
|
139
149
|
|
|
140
150
|
def consume
|
|
141
|
-
c = @
|
|
151
|
+
c = @chars[@pos]
|
|
142
152
|
return nil if c.nil?
|
|
143
153
|
|
|
144
154
|
@pos += 1
|
|
@@ -149,21 +159,17 @@ module CSS
|
|
|
149
159
|
@pos -= 1
|
|
150
160
|
end
|
|
151
161
|
|
|
152
|
-
def collect_newline_offsets(
|
|
162
|
+
def collect_newline_offsets(chars)
|
|
153
163
|
offsets = []
|
|
154
|
-
i =
|
|
164
|
+
i = 0
|
|
165
|
+
n = chars.length
|
|
155
166
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
# Newline characters themselves are reported as belonging to the line
|
|
161
|
-
# they terminate (col = offset + 1 on line 1, etc).
|
|
162
|
-
def line_column_at(offset)
|
|
163
|
-
idx = @newlines.bsearch_index { it >= offset } || @newlines.size
|
|
164
|
-
prev_nl = idx.zero? ? -1 : @newlines[idx - 1]
|
|
167
|
+
while i < n
|
|
168
|
+
offsets << i if chars[i] == "\n"
|
|
169
|
+
i += 1
|
|
170
|
+
end
|
|
165
171
|
|
|
166
|
-
|
|
172
|
+
offsets
|
|
167
173
|
end
|
|
168
174
|
|
|
169
175
|
def whitespace?(c)
|
|
@@ -242,7 +248,7 @@ module CSS
|
|
|
242
248
|
end
|
|
243
249
|
|
|
244
250
|
def eof?
|
|
245
|
-
@pos >= @
|
|
251
|
+
@pos >= @length
|
|
246
252
|
end
|
|
247
253
|
|
|
248
254
|
def consume_whitespace
|
data/lib/css/version.rb
CHANGED
data/sig/css/token.rbs
CHANGED
|
@@ -22,7 +22,7 @@ module CSS
|
|
|
22
22
|
def comment?: () -> bool
|
|
23
23
|
def trivia?: () -> bool
|
|
24
24
|
|
|
25
|
-
def
|
|
25
|
+
def assign_source!: (Integer start_offset, Integer end_offset, Array[Integer] newlines) -> self
|
|
26
26
|
|
|
27
27
|
def ==: (untyped other) -> bool
|
|
28
28
|
def eql?: (untyped other) -> bool
|